Skip to content

Commit

Permalink
Added OpenCL metrics query sample, update documentation, update SOFTWARE
Browse files Browse the repository at this point in the history
  • Loading branch information
anton-v-gorshkov committed Oct 1, 2021
1 parent 07c881d commit 9023c9d
Show file tree
Hide file tree
Showing 27 changed files with 1,013 additions and 122 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ You may obtain a copy of the License at https://opensource.org/licenses/MIT
- for [OpenCL(TM)](chapters/binary_source_correlation/OpenCL.md)
- for [oneAPI Level Zero (Level Zero)](chapters/binary_source_correlation/LevelZero.md)
4. Metrics Collection
- for [oneAPI Level Zero (Level Zero)](chapters/metrics_collection/LevelZero.md)
- based on [oneAPI Level Zero (Level Zero) Metric API](chapters/metrics_collection/LevelZero.md)
- based on [Intel(R) Metrics Discovery Application Programming Interface](chapters/metrics_collection/MetricsDiscoveryAPI.md)
- based on [Performance Monitoring (PM) Register](chapters/metrics_collection/PerfMonReg.md)
5. Binary Instrumentation
Expand All @@ -58,7 +58,8 @@ You may obtain a copy of the License at https://opensource.org/licenses/MIT
- [cl_hot_functions](samples/cl_hot_functions) - provides a list of hottest OpenCL(TM) API calls by backend (CPU and GPU);
- [cl_hot_kernels](samples/cl_hot_kernels) - provides a list of hottest OpenCL(TM) kernels by backend (CPU and GPU);
- [cl_debug_info](samples/cl_debug_info) - prints source and assembly (GEN ISA) for kernels on GPU;
- [cl_gpu_metrics](samples/cl_gpu_metrics) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle;
- [cl_gpu_metrics](samples/cl_gpu_metrics) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle (based on continuous metrics collection mode);
- [cl_gpu_query](samples/cl_gpu_query) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle (based on query metrics collection mode);
- tools for Level Zero, DPC++ (with Level Zero backend) and OpenMP* GPU offload (with Level Zero backend):
- [ze_hot_functions](samples/ze_hot_functions) - provides a list of hottest Level Zero API calls;
- [ze_hot_kernels](samples/ze_hot_kernels) - provides a list of hottest Level Zero kernels;
Expand Down
14 changes: 7 additions & 7 deletions SOFTWARE
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ level-zero-loader:
- https://github.com/oneapi-src/level-zero/releases/download/v1.4.1/level-zero-devel_1.4.1+u18.04_amd64.deb
compute-runtime:
deb:
- https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-gmmlib_21.2.1_amd64.deb
- https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8173/intel-igc-core_1.0.8173_amd64.deb
- https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8173/intel-igc-opencl_1.0.8173_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-opencl_21.31.20514_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-ocloc_21.31.20514_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-level-zero-gpu_1.1.20514_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-gmmlib_21.2.1_amd64.deb
- https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8708/intel-igc-core_1.0.8708_amd64.deb
- https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8708/intel-igc-opencl_1.0.8708_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-opencl_21.38.21026_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-ocloc_21.38.21026_amd64.deb
- https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-level-zero-gpu_1.2.21026_amd64.deb
metrics-discovery:
github:
build_path: build
Expand Down Expand Up @@ -43,7 +43,7 @@ oneapit-toolkits:
apt_repo: deb https://apt.repos.intel.com/oneapi all main
apt_source: /etc/apt/sources.list.d/oneAPI.list
packages:
- intel-basekit=2021.3.0-3219
- intel-basekit=2021.4.0-3422
finalize:
config:
commands:
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.37.4
0.38.0
10 changes: 8 additions & 2 deletions chapters/device_activity_tracing/LevelZero.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,16 @@ ze_result_t status = zeDeviceGetGlobalTimestamps(
device, &host_timestamp, &device_timestamp);
assert(status == ZE_RESULT_SUCCESS);
```
Note, that host timestamp value corresponds to `CLOCK_MONOTONIC_RAW` on Linux or `QueryPerformanceCounter` on Windows, while device timestamp for GPU is collected in raw GPU cycles and it's low 32 bits are the same as kernel or metric timestamps (kernel and metric timestamps in Level Zero limited to 32 bits for now).
Note that the host timestamp value corresponds to `CLOCK_MONOTONIC_RAW` on Linux or `QueryPerformanceCounter` on Windows, while the device timestamp for GPU is collected in raw GPU cycles. Also note that not all bits of the device timestamp are valid; to get the exact number of valid bits, use the `timestampValidBits` field of the `ze_device_properties_t` structure (or the `kernelTimestampValidBits` field when the value has to be matched with kernel timestamps), e.g.:
```cpp
uint64_t kernel_timestamp = (device_timestamp & 0x0FFFFFFFF);
ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, };
ze_result_t status = zeDeviceGetProperties(device, &props);
assert(status == ZE_RESULT_SUCCESS);
uint64_t mask = (1ull << props.kernelTimestampValidBits) - 1ull;
uint64_t kernel_timestamp = (device_timestamp & mask);
```
The same valid bits mask should be applied to `global` kernel timestamps.

To convert GPU cycles into seconds one may use `timerResolution` field from `ze_device_properties_t` structure, that represents cycles per second starting from Level Zero 1.1:
```cpp
Expand Down
16 changes: 15 additions & 1 deletion chapters/device_activity_tracing/OpenCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,25 @@ void CL_CALLBACK EventNotify(cl_event event,
assert(status == CL_SUCCESS);
}
```
## Time Correlation
It is often necessary to map OpenCL(TM) kernel timestamps to the general CPU timeline. To do this, one should use the `clGetDeviceAndHostTimer` function to get a time synchronization point between host and device:
```cpp
cl_ulong device_timestamp = 0, host_timestamp = 0;
cl_int status = clGetDeviceAndHostTimer(
device, &device_timestamp, &host_timestamp);
assert(status == CL_SUCCESS);
```
Note that the host timestamp in Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver is based on `CLOCK_MONOTONIC_RAW` on Linux and `QueryPerformanceCounter` on Windows (this is implementation-specific and may change in the future). Both timestamps are in nanoseconds.
## Usage Details
- refer to the documentation for the function [clGetEventProfilingInfo](https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetEventProfilingInfo.html) to learn more on OpenCL(TM) profiling
## Samples
- [OpenCL(TM) GEMM](../../samples/cl_gemm)
- [OpenCL(TM) Hot Kernels](../../samples/cl_hot_kernels)
- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics)
- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics)
## Tools
- [OpenCL(TM) Tracer](../../tools/cl_tracer)
- [Tracing and Profiling Tool for Data Parallel C++ (DPC++)](../../tools/onetrace)
- [GPU Metrics Collection Tool for Data Parallel C++ (DPC++)](../../tools/oneprof)
10 changes: 10 additions & 0 deletions chapters/metrics_collection/LevelZero.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ assert(status == ZE_RESULT_SUCCESS);
```
### Time Correlation
#### Level Zero Kernels
To map a metric report to some particular device activity (e.g. kernel execution), one needs to correlate the report timestamp (the metric named `QueryBeginTime` inside a report) with the kernel `global` timestamps retrieved with the help of [device activity tracing](../device_activity_tracing/LevelZero.md).
The difference between these two timestamps is that the kernel time is in GPU clocks, while `QueryBeginTime` is in nanoseconds. To convert clocks to nanoseconds, one needs to know the GPU timer frequency and use the following formula:
Expand All @@ -292,6 +293,15 @@ gpuTimestampNs = gpuTimestampClocks * NS_IN_SEC / gpuTimerFrequency
```
Starting from version 1.1, Level Zero provides this value as `timerResolution` field of `ze_device_properties_t` structure in cycles per second. Also it can be retrieved with the help of Intel(R) Metrics Discovery Application Programming Interface as part of device information as `GpuTimestampFrequency` symbol (look into "Device Information" section from [here](./MetricsDiscoveryAPI.md) for details).
Also note that not all bits of the `global` kernel timestamp value may be valid; to get the exact number of valid bits, use the `kernelTimestampValidBits` field of the `ze_device_properties_t` structure.
#### OpenCL(TM) Kernels
A common strategy for mapping metrics to OpenCL(TM) kernels may be the following:
1. Collect kernel timestamps based on [OpenCL(TM) device activity tracing](../device_activity_tracing/OpenCL.md) mechanism;
2. Convert device timestamps into host timestamps with the help of `clGetDeviceAndHostTimer` function (Time Correlation section [here](../device_activity_tracing/OpenCL.md));
3. Convert host timestamps into Level Zero kernel timestamps with the help of `zeDeviceGetGlobalTimestamps` function (Time Correlation section [here](../device_activity_tracing/LevelZero.md));
4. Use the approach described for Level Zero kernels (above).
## Build and Run
To make metrics collection work, one needs to link the application with the Level Zero ICD library (e.g. `libze_loader.so`) and run it as follows:
```
Expand Down
87 changes: 50 additions & 37 deletions chapters/metrics_collection/MetricsDiscoveryAPI.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ for (uint32_t gid = 0; gid < device->GetParams()->ConcurrentGroupsCount; ++gid)
}
```

### Collection
### Continuous Collection
The process of metrics collection with Intel(R) Metrics Discovery Application Programming Interface assumes that there is an infinite loop in a separate thread, where one periodically asks for collected samples, reads the data for a chunk of samples and stores it into some memory or file (one sample contains all the metrics and information items from a metric set).

First, one should set sampling interval for collection (in nanoseconds) and determine the size of the buffer with raw results (in bytes).
Expand Down Expand Up @@ -231,57 +231,69 @@ assert(status == md::CC_OK);
calculated_reports.resize(calculated_report_count * calculated_report_size);
```

### Time Correlation With OpenCL(TM)
### Time Correlation
It's often needed to map collected hardware metrics to a kernel in terms of time intervals.

Each metric set contains a special *information* item called `QueryBeginTime` that represents a timestamp (in nanoseconds) for a sample. At the same time, one can collect kernel execution intervals using ***Device Activity Tracing*** capabilities. So to map a particular sample to a kernel invocation, one just needs to check whether the sample timestamp lies between the kernel start and end timestamps.

The problem is that metrics timestamp one can get with Intel(R) Metrics Discovery Application Programming Interface and kernel timestamps one can get e.g. with OpenCL(TM) are different and can't be compared directly - so one has to convert them to a single time format first.
The problem is that metrics timestamp one can get with Intel(R) Metrics Discovery Application Programming Interface and kernel timestamps one can get e.g. with OpenCL(TM) or with oneAPI Level Zero (Level Zero) are different and can't be compared directly - so one has to convert them to a single time format first.

In Intel(R) Metrics Discovery Application Programming Interface library there is a function `GetGpuCpuTimestamps` that allows to bind GPU metrics timestamp to some CPU timestamp (which is based on `CLOCK_MONOTONIC` on Linux and `QueryPerformanceCounter` on Windows).

So e.g. to convert GPU metrics timestamp (`gpuTimestamp`) to OpenCL GPU timestamp (`cpu_timestamp`), which is based on `CLOCK_MONOTONIC_RAW` on Linux, one should perform the following steps:
1. Get "time snap point" to correlate GPU and `CLOCK_MONOTONIC` time:
So the common strategy of metrics to kernels mapping is the following:
1. Convert `QueryBeginTime` into CPU timestamp with the help of `GetGpuCpuTimestamps` of Intel(R) Metrics Discovery Application Programming Interface library;
2. Convert kernel timestamp into host timestamp:
- for OpenCL(TM) - with the help of `clGetDeviceAndHostTimer` function (Time Correlation section [here](../device_activity_tracing/OpenCL.md));
- for oneAPI Level Zero (Level Zero) - with the help of `zeDeviceGetGlobalTimestamps` function (Time Correlation section [here](../device_activity_tracing/LevelZero.md));
- on Linux one may need to convert `CLOCK_MONOTONIC_RAW` into `CLOCK_MONOTONIC` to use the same time units;
3. Directly compare the metric CPU timestamp with the kernel host start and host end timestamps to perform metrics to kernel correlation.

### Query-Based Collection for OpenCL(TM)
An alternative approach is to collect a single aggregated metric report per kernel invocation. In some sense this may be easier than time-based collection, since one doesn't need to worry about time correlation (the report already belongs to the kernel) or to deal with a separate thread (the runtime takes most of the responsibility for data collection). On the other hand, one will get only one aggregated report per kernel, which may not be enough to analyse kernel behaviour over time. Also, this approach is supported only by specific runtimes (e.g. OpenCL(TM)).

To enable query-based metrics collection for OpenCL(TM), one should perform the following steps:
1. Create MD device and choose target metric set (as described above);
2. Set API filtering mode to OpenCL(TM) and activate metric set:
```cpp
uint64_t cpu_snap_point = 0, gpu_snap_point = 0;
status = device->GetGpuCpuTimestamps(
&gpu_snap_point, &cpu_snap_point, nullptr);
md::TCompletionCode status = set_->SetApiFiltering(
md::API_TYPE_OCL | md::API_TYPE_OGL4_X);
assert(status == md::CC_OK);
status = set_->Activate();
assert(status == md::CC_OK);
```
2. Calculate `CLOCK_MONOTONIC` time for `gpuTimestamp`:
3. To be able to retrieve metrics for a kernel, one needs to create a specific command queue with the help of an extension function. The `configuration` argument here can be obtained from the target metric set. Note that the `CL_QUEUE_PROFILING_ENABLE` property is required for such a queue:
```cpp
if (gpuTimestamp > gpu_snap_point) {
cpu_timestamp = cpu_snap_point + (gpuTimestamp - gpu_snap_point);
} else {
cpu_timestamp = cpu_snap_point - (gpu_snap_point - gpuTimestamp);
}
cl_command_queue CL_API_CALL
clCreatePerfCountersCommandQueueINTEL(
cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_uint configuration,
cl_int *errcodeRet);
// ...
cl_uint configuration = set_->GetParams()->ApiSpecificId.OCL;
```
3. Convert `CLOCK_MONOTONIC` to `CLOCK_MONOTONIC_RAW`:
4. Metric report for specific kernel could be retrieved as event profiling info:
```cpp
uint64_t ConvertClockMonotonicToRaw(uint64_t clock_monotonic) {
timespec monotonic_time;
timespec raw_time;
int status = 0;

status = clock_gettime(CLOCK_MONOTONIC, &monotonic_time);
assert(status == 0);
status = clock_gettime(CLOCK_MONOTONIC_RAW, &raw_time);
assert(status == 0);

uint64_t raw = raw_time.tv_nsec + NSEC_IN_SEC * raw_time.tv_sec;
uint64_t monotonic = monotonic_time.tv_nsec +
NSEC_IN_SEC * monotonic_time.tv_sec;
if (raw > monotonic) {
return clock_monotonic + (raw - monotonic);
} else {
return clock_monotonic - (monotonic - raw);
}
}
// ...
#define CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL 0x407F

size_t report_size = set_->GetParams()->QueryReportSize;
PTI_ASSERT(report_size > 0);

cpu_timestamp = ConvertClockMonotonicToRaw(cpu_timestamp);
std::vector<uint8_t> report(report_size, 0);
size_t output_size = 0;
cl_int status = clGetEventProfilingInfo(
event, CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL,
report_size, report.data(), &output_size);
assert(status == CL_SUCCESS);
```
5. Report data grabbed from `clGetEventProfilingInfo` should be calculated into metrics (described above). There will be a single metric report.
6. To finalize data collection, one should deactivate the target metric set and remove the MD device:
```cpp
md::TCompletionCode status = set_->Deactivate();
assert(status == md::CC_OK);
```
After that one can directly compare this `cpu_timestamp` with kernel start and end timestamps to perform metrics to kernel correlation.
Query-based metrics collection for Level Zero is described [here](./LevelZero.md).

## Build and Run
Since the Intel(R) Metrics Discovery Application Programming Interface library is loaded dynamically at runtime, there is no need for any special build/run options. Just make sure the Intel(R) Metrics Discovery Application Programming Interface library can be found correctly:
Expand All @@ -296,6 +308,7 @@ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_libmd.so> ./<application>

## Samples
- [GPU Metrics for OpenCL(TM)](../../samples/cl_gpu_metrics)
- [GPU Query for OpenCL(TM)](../../samples/cl_gpu_query)

## Tools
- [GPU Info](../../tools/gpuinfo)
2 changes: 1 addition & 1 deletion samples/cl_debug_info/cl_debug_info_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ class ClDebugInfoCollector {
"for target device" << std::endl;
if (tracer != nullptr) {
delete tracer;
delete collector;
}
delete collector;
return nullptr;
}

Expand Down
1 change: 1 addition & 0 deletions samples/cl_gpu_metrics/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ SetBuildType()

# Tool Library

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPTI_KERNEL_INTERVALS=1")
add_library(clt_gpu_metrics SHARED
"${PROJECT_SOURCE_DIR}/../../loader/init.cc"
tool.cc)
Expand Down
4 changes: 1 addition & 3 deletions samples/cl_gpu_metrics/cl_metric_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ enum CollectorState {

class ClMetricCollector {
public:
static ClMetricCollector* Create(
cl_device_id device, const char* set_name) {
PTI_ASSERT(device != nullptr);
static ClMetricCollector* Create(const char* set_name) {
PTI_ASSERT(set_name != nullptr);

std::string device_string = utils::GetEnv("PTI_DEVICE_ID");
Expand Down
2 changes: 1 addition & 1 deletion samples/cl_gpu_metrics/tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ void EnableProfiling() {
return;
}

metric_collector = ClMetricCollector::Create(device, "ComputeBasic");
metric_collector = ClMetricCollector::Create("ComputeBasic");
if (metric_collector == nullptr) {
kernel_collector->DisableTracing();
delete kernel_collector;
Expand Down
39 changes: 39 additions & 0 deletions samples/cl_gpu_query/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
include("../../build_utils/CMakeLists.txt")
SetRequiredCMakeVersion()
cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION})

project(PTI_Samples_OpenCL_GPU_Query CXX)
SetCompilerFlags()
SetBuildType()

# Tool Library

add_library(clt_gpu_query SHARED
"${PROJECT_SOURCE_DIR}/../../loader/init.cc"
"${PROJECT_SOURCE_DIR}/../../utils/trace_guard.cc"
tool.cc)
target_include_directories(clt_gpu_query
PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
if(CMAKE_INCLUDE_PATH)
target_include_directories(clt_gpu_query
PUBLIC "${CMAKE_INCLUDE_PATH}")
endif()

FindOpenCLLibrary(clt_gpu_query)
FindOpenCLHeaders(clt_gpu_query)

GetOpenCLTracingHeaders(clt_gpu_query)

GetMDHeaders(clt_gpu_query)
CheckForMDLibrary(clt_gpu_query)

# Loader

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOOL_NAME=clt_gpu_query")
add_executable(cl_gpu_query "${PROJECT_SOURCE_DIR}/../../loader/loader.cc")
target_include_directories(cl_gpu_query
PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
if(UNIX)
target_link_libraries(cl_gpu_query
dl)
endif()
Loading

0 comments on commit 9023c9d

Please sign in to comment.