Added OpenCL metrics query sample, update documentation, update SOFTWARE

jfedorov · Oct 1, 2021 · 9023c9d · 9023c9d
1 parent 07c881d
commit 9023c9d
Show file tree

Hide file tree

Showing 27 changed files with 1,013 additions and 122 deletions.
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ You may obtain a copy of the License at https://opensource.org/licenses/MIT
     - for [OpenCL(TM)](chapters/binary_source_correlation/OpenCL.md)
     - for [oneAPI Level Zero (Level Zero)](chapters/binary_source_correlation/LevelZero.md)
 4. Metrics Collection
-    - for [oneAPI Level Zero (Level Zero)](chapters/metrics_collection/LevelZero.md)
+    - based on [oneAPI Level Zero (Level Zero) Metric API](chapters/metrics_collection/LevelZero.md)
     - based on [Intel(R) Metrics Discovery Application Programming Interface](chapters/metrics_collection/MetricsDiscoveryAPI.md)
     - based on [Performance Monitoring (PM) Register](chapters/metrics_collection/PerfMonReg.md)
 5. Binary Instrumentation
@@ -58,7 +58,8 @@ You may obtain a copy of the License at https://opensource.org/licenses/MIT
     - [cl_hot_functions](samples/cl_hot_functions) - provides a list of hottest OpenCL(TM) API calls by backend (CPU and GPU);
     - [cl_hot_kernels](samples/cl_hot_kernels) - provides a list of hottest OpenCL(TM) kernels by backend (CPU and GPU);
     - [cl_debug_info](samples/cl_debug_info) - prints source and assembly (GEN ISA) for kernels on GPU;
-    - [cl_gpu_metrics](samples/cl_gpu_metrics) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle;
+    - [cl_gpu_metrics](samples/cl_gpu_metrics) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle (based on continuous metrics collection mode);
+    - [cl_gpu_query](samples/cl_gpu_query) - provides a list of hottest OpenCL(TM) GPU kernels along with percent of cycles it was active, stall and idle (based on query metrics collection mode);
 - tools for Level Zero, DPC++ (with Level Zero backend) and OpenMP* GPU offload (with Level Zero backend):
     - [ze_hot_functions](samples/ze_hot_functions) - provides a list of hottest Level Zero API calls;
     - [ze_hot_kernels](samples/ze_hot_kernels) - provides a list of hottest Level Zero kernels;

diff --git a/SOFTWARE b/SOFTWARE
@@ -4,12 +4,12 @@ level-zero-loader:
   - https://github.com/oneapi-src/level-zero/releases/download/v1.4.1/level-zero-devel_1.4.1+u18.04_amd64.deb
 compute-runtime:
   deb:
-  - https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-gmmlib_21.2.1_amd64.deb
-  - https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8173/intel-igc-core_1.0.8173_amd64.deb
-  - https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8173/intel-igc-opencl_1.0.8173_amd64.deb
-  - https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-opencl_21.31.20514_amd64.deb
-  - https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-ocloc_21.31.20514_amd64.deb
-  - https://github.com/intel/compute-runtime/releases/download/21.31.20514/intel-level-zero-gpu_1.1.20514_amd64.deb
+  - https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-gmmlib_21.2.1_amd64.deb
+  - https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8708/intel-igc-core_1.0.8708_amd64.deb
+  - https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.8708/intel-igc-opencl_1.0.8708_amd64.deb
+  - https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-opencl_21.38.21026_amd64.deb
+  - https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-ocloc_21.38.21026_amd64.deb
+  - https://github.com/intel/compute-runtime/releases/download/21.38.21026/intel-level-zero-gpu_1.2.21026_amd64.deb
 metrics-discovery:
   github:
     build_path: build
@@ -43,7 +43,7 @@ oneapit-toolkits:
     apt_repo: deb https://apt.repos.intel.com/oneapi all main
     apt_source: /etc/apt/sources.list.d/oneAPI.list
     packages:
-    - intel-basekit=2021.3.0-3219
+    - intel-basekit=2021.4.0-3422
 finalize:
   config:
     commands:

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.37.4
+0.38.0
diff --git a/chapters/device_activity_tracing/LevelZero.md b/chapters/device_activity_tracing/LevelZero.md
@@ -81,10 +81,16 @@ ze_result_t status = zeDeviceGetGlobalTimestamps(
     device, &host_timestamp, &device_timestamp);
 assert(status == ZE_RESULT_SUCCESS);
 ```
-Note, that host timestamp value corresponds to `CLOCK_MONOTONIC_RAW` on Linux or `QueryPerformanceCounter` on Windows, while device timestamp for GPU is collected in raw GPU cycles and it's low 32 bits are the same as kernel or metric timestamps (kernel and metric timestamps in Level Zero limited to 32 bits for now).
+Note that the host timestamp value corresponds to `CLOCK_MONOTONIC_RAW` on Linux or `QueryPerformanceCounter` on Windows, while the device timestamp for GPU is collected in raw GPU cycles. Also note that not all bits of the device timestamp are valid; to get the exact number of valid bits, use the `timestampValidBits` field (or `kernelTimestampValidBits` for kernel timestamps) from the `ze_device_properties_t` structure, e.g.:
 ```cpp
-uint64_t kernel_timestamp = (device_timestamp & 0x0FFFFFFFF);
+ze_device_properties_t props{ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, };
+ze_result_t status = zeDeviceGetProperties(device, &props);
+assert(status == ZE_RESULT_SUCCESS);
+
+uint64_t mask = (1ull << props.kernelTimestampValidBits) - 1ull;
+uint64_t kernel_timestamp = (device_timestamp & mask);
 ```
+The same valid bits mask should be applied to `global` kernel timestamps.
 
 To convert GPU cycles into seconds one may use `timerResolution` field from `ze_device_properties_t` structure, that represents cycles per second starting from Level Zero 1.1:
 ```cpp

diff --git a/chapters/device_activity_tracing/OpenCL.md b/chapters/device_activity_tracing/OpenCL.md
@@ -69,11 +69,25 @@ void CL_CALLBACK EventNotify(cl_event event,
     assert(status == CL_SUCCESS);
 }
 ```
+## Time Correlation
+It's commonly needed to map OpenCL kernel timestamps to general CPU timeline. To solve this problem one should use `clGetDeviceAndHostTimer` function to get time sync point between host and device:
+```cpp
+cl_ulong device_timestamp = 0, host_timestamp = 0;
+cl_int status = clGetDeviceAndHostTimer(
+    device, &device_timestamp, &host_timestamp);
+assert(status == CL_SUCCESS);
+```
+Note, that host timestamp in Intel(R) Graphics Compute Runtime for oneAPI Level Zero and OpenCL(TM) Driver is based on `CLOCK_MONOTONIC_RAW` on Linux and `QueryPerformanceCounter` on Windows (implementation specific, may be changed in future). Both timers are in nanoseconds.
 
 ## Usage Details
 - refer to the documentation for the function [clGetEventProfilingInfo](https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetEventProfilingInfo.html) to learn more on OpenCL(TM) profiling
 
 ## Samples
 - [OpenCL(TM) GEMM](../../samples/cl_gemm)
 - [OpenCL(TM) Hot Kernels](../../samples/cl_hot_kernels)
-- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics)
+- [OpenCL(TM) GPU Metrics](../../samples/cl_gpu_metrics)
+
+## Tools
+- [OpenCL(TM) Tracer](../../tools/cl_tracer)
+- [Tracing and Profiling Tool for Data Parallel C++ (DPC++)](../../tools/onetrace)
+- [GPU Metrics Collection Tool for Data Parallel C++ (DPC++)](../../tools/oneprof)
diff --git a/chapters/metrics_collection/LevelZero.md b/chapters/metrics_collection/LevelZero.md
@@ -283,6 +283,7 @@ assert(status == ZE_RESULT_SUCCESS);
 ```
 
 ### Time Correlation
+#### Level Zero Kernels
 To map metric report to some particular device activity (e.g. kernel execution) one need to correlate report timestamp (metric with name `QueryBeginTime` inside a report) to kernel `global` timestamps retrieved with the help of [device activity tracing](../device_activity_tracing/LevelZero.md).
 
 The difference between these two timestamps is that kernel time is in GPU clocks, and `QueryBeginTime` is in nanoseconds. To convert clocks to nanoseconds one need to know GPU timer frequency and use the following formula:
@@ -292,6 +293,15 @@ gpuTimestampNs = gpuTimestampClocks * NS_IN_SEC / gpuTimerFrequency
 ```
 Starting from version 1.1, Level Zero provides this value as `timerResolution` field of `ze_device_properties_t` structure in cycles per second. Also it can be retrieved with the help of Intel(R) Metrics Discovery Application Programming Interface as part of device information as `GpuTimestampFrequency` symbol (look into "Device Information" section from [here](./MetricsDiscoveryAPI.md) for details).
 
+Also note that not all bits in `global` kernel timestamp value may be valid, to get exact number of valid bits use `timestampValidBits` field from `ze_device_properties_t` structure.
+
+#### OpenCL(TM) Kernels
+A common strategy for mapping metrics to OpenCL(TM) kernels may be the following:
+1. Collect kernel timestamps based on [OpenCL(TM) device activity tracing](../device_activity_tracing/OpenCL.md) mechanism;
+2. Convert device timestamps into host timestamps with the help of `clGetDeviceAndHostTimer` function (Time Correlation section [here](../device_activity_tracing/OpenCL.md));
+3. Convert host timestamps into Level Zero kernel timestamps with the help of `zeDeviceGetGlobalTimestamps` function (Time Correlation section [here](../device_activity_tracing/LevelZero.md));
+4. Use the approach described for Level Zero kernels (above).
+
 ## Build and Run
 To make metrics collection work one need to link the application with Level Zero ICD library (e.g. `libze_loader.so`) and run it as following:
 ```

diff --git a/chapters/metrics_collection/MetricsDiscoveryAPI.md b/chapters/metrics_collection/MetricsDiscoveryAPI.md
@@ -144,7 +144,7 @@ for (uint32_t gid = 0; gid < device->GetParams()->ConcurrentGroupsCount; ++gid)
 }
 ```
 
-### Collection
+### Continuous Collection
 Process of metrics collection with Intel(R) Metrics Discovery Application Programming Interface assumes that there is an infinite loop in a separate thread, where one asks for collected samples periodically, reads the data for a chunk of samples and stores them into some memory or file (one sample contains all the metrics and information items from a metric set).
 
 First, one should set sampling interval for collection (in nanoseconds) and determine the size of the buffer with raw results (in bytes).
@@ -231,57 +231,69 @@ assert(status == md::CC_OK);
 calculated_reports.resize(calculated_report_count * calculated_report_size);
 ```
 
-### Time Correlation With OpenCL(TM)
+### Time Correlation
 It's often needed to map collected hardware metrics to a kernel in terms of time intervals.
 
 Each metric set contains a special *information* item called `QueryBeginTime` that represents a timestamp (in nanoseconds) for a sample. At the same time one can collect kernel execution intervals using ***Device Activity Tracing*** capabilities. So to map exact sample to the kernel invocation, one need just to check if sample timestamp is in between of kernel start and end timestamps.
 
-The problem is that metrics timestamp one can get with Intel(R) Metrics Discovery Application Programming Interface and kernel timestamps one can get e.g. with OpenCL(TM) are different and can't be compared directly - so one has to convert them to a single time format first.
+The problem is that metrics timestamp one can get with Intel(R) Metrics Discovery Application Programming Interface and kernel timestamps one can get e.g. with OpenCL(TM) or with oneAPI Level Zero (Level Zero) are different and can't be compared directly - so one has to convert them to a single time format first.
 
 In Intel(R) Metrics Discovery Application Programming Interface library there is a function `GetGpuCpuTimestamps` that allows to bind GPU metrics timestamp to some CPU timestamp (which is based on `CLOCK_MONOTONIC` on Linux and `QueryPerformanceCounter` on Windows).
 
-So e.g. to convert GPU metrics timestamp (`gpuTimestamp`) to OpenCL GPU timestamp (`cpu_timestamp`), which is based on `CLOCK_MONOTONIC_RAW` on Linux, one should perform the following steps:
-1. Get "time snap point" to correlate GPU and `CLOCK_MONOTONIC` time:
+So the common strategy of metrics to kernels mapping is the following:
+1. Convert `QueryBeginTime` into CPU timestamp with the help of `GetGpuCpuTimestamps` of Intel(R) Metrics Discovery Application Programming Interface library;
+2. Convert kernel timestamp into host timestamp:
+    - for OpenCL(TM) - with the help of `clGetDeviceAndHostTimer` function (Time Correlation section [here](../device_activity_tracing/OpenCL.md));
+    - for oneAPI Level Zero (Level Zero) - with the help of `zeDeviceGetGlobalTimestamps` function (Time Correlation section [here](../device_activity_tracing/LevelZero.md));
+    - on Linux one may need to convert `CLOCK_MONOTONIC_RAW` into `CLOCK_MONOTONIC` to use the same time units;
+3. Directly compare the metric CPU timestamp with the kernel host start and host end timestamps to perform metrics to kernel correlation.
+
+### Query-Based Collection for OpenCL(TM)
+An alternative approach is to collect a single aggregated metric report per kernel invocation. In some sense this may be easier than time-based collection, since one doesn't need to worry about time correlation (the report is already for the kernel) or deal with a separate thread (the runtime takes most of the responsibility for data collection). On the other hand, one will get only a single aggregated report per kernel, which may not be enough to analyse kernel behaviour over time. Also, this approach is supported only by specific runtimes (e.g. OpenCL(TM)).
+
+To enable query-based metrics collection for OpenCL(TM) one should perform the following steps:
+1. Create MD device and choose target metric set (as described above);
+2. Set API filtering mode to OpenCL(TM) and activate metric set:
 ```cpp
-uint64_t cpu_snap_point = 0, gpu_snap_point = 0;
-status = device->GetGpuCpuTimestamps(
-    &gpu_snap_point, &cpu_snap_point, nullptr);
+md::TCompletionCode status = set_->SetApiFiltering(
+    md::API_TYPE_OCL | md::API_TYPE_OGL4_X);
+assert(status == md::CC_OK);
+status = set_->Activate();
 assert(status == md::CC_OK);
 ```
-2. Calculate `CLOCK_MONOTONIC` time for `gpuTimestamp`:
+3. To be able to retrieve metrics for a kernel, one needs to create a specific command queue with the help of the `clCreatePerfCountersCommandQueueINTEL` extension function. The `configuration` argument can be obtained from the target metric set. Note that the `CL_QUEUE_PROFILING_ENABLE` property is required for such a queue:
 ```cpp
-if (gpuTimestamp > gpu_snap_point) {
-  cpu_timestamp = cpu_snap_point + (gpuTimestamp - gpu_snap_point);
-} else {
-  cpu_timestamp = cpu_snap_point - (gpu_snap_point - gpuTimestamp);
-}
+cl_command_queue CL_API_CALL
+clCreatePerfCountersCommandQueueINTEL(
+    cl_context context,
+    cl_device_id device,
+    cl_command_queue_properties properties,
+    cl_uint configuration,
+    cl_int *errcodeRet);
+// ...
+cl_uint configuration = set_->GetParams()->ApiSpecificId.OCL;
 ```
-3. Convert `CLOCK_MONOTONIC` to `CLOCK_MONOTONIC_RAW`:
+4. Metric report for specific kernel could be retrieved as event profiling info:
 ```cpp
-uint64_t ConvertClockMonotonicToRaw(uint64_t clock_monotonic) {
-  timespec monotonic_time;
-  timespec raw_time;
-  int status = 0;
-
-  status = clock_gettime(CLOCK_MONOTONIC, &monotonic_time);
-  assert(status == 0);
-  status = clock_gettime(CLOCK_MONOTONIC_RAW, &raw_time);
-  assert(status == 0);
-
-  uint64_t raw = raw_time.tv_nsec + NSEC_IN_SEC * raw_time.tv_sec;
-  uint64_t monotonic = monotonic_time.tv_nsec +
-    NSEC_IN_SEC * monotonic_time.tv_sec;
-  if (raw > monotonic) {
-      return clock_monotonic + (raw - monotonic);
-  } else {
-      return clock_monotonic - (monotonic - raw);
-  }
-}
-// ...
+#define CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL 0x407F
+
+size_t report_size = set_->GetParams()->QueryReportSize;
+PTI_ASSERT(report_size > 0);
 
-cpu_timestamp = ConvertClockMonotonicToRaw(cpu_timestamp);
+std::vector<uint8_t> report(report_size, 0);
+size_t output_size = 0;
+cl_int status = clGetEventProfilingInfo(
+    event, CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL,
+    report_size, report.data(), &output_size);
+assert(status == CL_SUCCESS);
+```
+5. Report data grabbed from `clGetEventProfilingInfo` should be calculated into metrics (described above). There will be a single metric report.
+6. To finalize data collection one should deactivate the target metric set and remove the MD device:
+```cpp
+md::TCompletionCode status = set_->Deactivate();
+assert(status == md::CC_OK);
 ```
-After that one can directly compare this `cpu_timestamp` with kernel start and end timestamps to perform metrics to kernel correlation.
+Query-based metrics collection for Level Zero is described [here](./LevelZero.md).
 
 ## Build and Run
 Since Intel(R) Metrics Discovery Application Programming Interface library is loaded dynamically at runtime, there is no need in any special build/run options. Just make sure Intel(R) Metrics Discovery Application Programming Interface library can be found correctly:
@@ -296,6 +308,7 @@ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_libmd.so> ./<application>
 
 ## Samples
 - [GPU Metrics for OpenCL(TM)](../../samples/cl_gpu_metrics)
+- [GPU Query for OpenCL(TM)](../../samples/cl_gpu_query)
 
 ## Tools
 - [GPU Info](../../tools/gpuinfo)
diff --git a/samples/cl_debug_info/cl_debug_info_collector.h b/samples/cl_debug_info/cl_debug_info_collector.h
@@ -59,8 +59,8 @@ class ClDebugInfoCollector {
         "for target device" << std::endl;
       if (tracer != nullptr) {
         delete tracer;
-        delete collector;
       }
+      delete collector;
       return nullptr;
     }
 

diff --git a/samples/cl_gpu_metrics/CMakeLists.txt b/samples/cl_gpu_metrics/CMakeLists.txt
@@ -8,6 +8,7 @@ SetBuildType()
 
 # Tool Library
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPTI_KERNEL_INTERVALS=1")
 add_library(clt_gpu_metrics SHARED
   "${PROJECT_SOURCE_DIR}/../../loader/init.cc"
   tool.cc)

diff --git a/samples/cl_gpu_metrics/cl_metric_collector.h b/samples/cl_gpu_metrics/cl_metric_collector.h
@@ -25,9 +25,7 @@ enum CollectorState {
 
 class ClMetricCollector {
  public:
-  static ClMetricCollector* Create(
-      cl_device_id device, const char* set_name) {
-    PTI_ASSERT(device != nullptr);
+  static ClMetricCollector* Create(const char* set_name) {
     PTI_ASSERT(set_name != nullptr);
 
     std::string device_string = utils::GetEnv("PTI_DEVICE_ID");

diff --git a/samples/cl_gpu_metrics/tool.cc b/samples/cl_gpu_metrics/tool.cc
@@ -239,7 +239,7 @@ void EnableProfiling() {
     return;
   }
 
-  metric_collector = ClMetricCollector::Create(device, "ComputeBasic");
+  metric_collector = ClMetricCollector::Create("ComputeBasic");
   if (metric_collector == nullptr) {
     kernel_collector->DisableTracing();
     delete kernel_collector;

diff --git a/samples/cl_gpu_query/CMakeLists.txt b/samples/cl_gpu_query/CMakeLists.txt
@@ -0,0 +1,39 @@
+include("../../build_utils/CMakeLists.txt")
+SetRequiredCMakeVersion()
+cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION})
+
+project(PTI_Samples_OpenCL_GPU_Query CXX)
+SetCompilerFlags()
+SetBuildType()
+
+# Tool Library
+
+add_library(clt_gpu_query SHARED
+  "${PROJECT_SOURCE_DIR}/../../loader/init.cc"
+  "${PROJECT_SOURCE_DIR}/../../utils/trace_guard.cc"
+  tool.cc)
+target_include_directories(clt_gpu_query
+  PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
+if(CMAKE_INCLUDE_PATH)
+  target_include_directories(clt_gpu_query
+    PUBLIC "${CMAKE_INCLUDE_PATH}")
+endif()
+
+FindOpenCLLibrary(clt_gpu_query)
+FindOpenCLHeaders(clt_gpu_query)
+
+GetOpenCLTracingHeaders(clt_gpu_query)
+
+GetMDHeaders(clt_gpu_query)
+CheckForMDLibrary(clt_gpu_query)
+
+# Loader
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOOL_NAME=clt_gpu_query")
+add_executable(cl_gpu_query "${PROJECT_SOURCE_DIR}/../../loader/loader.cc")
+target_include_directories(cl_gpu_query
+  PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
+if(UNIX)
+  target_link_libraries(cl_gpu_query
+    dl)
+endif()