From d8cdce07f96a1828e83850b0996938d323a55b99 Mon Sep 17 00:00:00 2001
From: 1274085042 <a1274085042@outlook.com>
Date: Wed, 11 Mar 2026 19:47:31 +0800
Subject: [PATCH] Add HIP/ROCm support to cuda_kernel_launch_stats

Add support for HIP runtime events (hipLaunchKernel, hipLaunchKernelExC,
hipMemcpyAsync, hipMemsetAsync) to the cuda_kernel_launch_stats method,
enabling kernel launch statistics analysis for ROCm/HIP traces.

Changes:
- Add HIP symbol lookups alongside existing CUDA symbols
- Include hipLaunchKernel/hipLaunchKernelExC in launch_ids
- Include hipMemcpyAsync/hipMemsetAsync in memory event filtering
- Use isin() for memory event filtering, after dropping symbol IDs that
  are None (symbol absent from the trace)
- Add comprehensive unit tests (9 test cases) covering:
  - HIP launch kernel events
  - HIP memory events (include/exclude)
  - Mixed CUDA+HIP traces
  - Pure ROCm traces (no CUDA symbols)
  - Negative launch delay clipping
  - Multiple ranks
  - Traces with no matching launch events (empty result)
---
 hta/analyzers/cuda_kernel_analysis.py |  18 +-
 tests/test_hip_kernel_launch_stats.py | 350 ++++++++++++++++++++++++++
 2 files changed, 364 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_hip_kernel_launch_stats.py

diff --git a/hta/analyzers/cuda_kernel_analysis.py b/hta/analyzers/cuda_kernel_analysis.py
index 4fb36db..450d9e1 100644
--- a/hta/analyzers/cuda_kernel_analysis.py
+++ b/hta/analyzers/cuda_kernel_analysis.py
@@ -559,29 +559,39 @@ def cuda_kernel_launch_stats(
 
             # filter out events which have correlation value matching to
             # cudaLaunchKernel, cudaLaunchKernelExC, cudaMemcpyAsync, cudaMemsetAsync
+            # Also support HIP equivalents for ROCm traces
             cuda_launch_kernel_id = sym_index.get("cudaLaunchKernel", None)
             cuda_launch_kernel_ex_c_id = sym_index.get("cudaLaunchKernelExC", None)
             cuda_memcpy_async_id = sym_index.get("cudaMemcpyAsync", None)
             cuda_memset_async_id = sym_index.get("cudaMemsetAsync", None)
+            hip_launch_kernel_id = sym_index.get("hipLaunchKernel", None)
+            hip_launch_kernel_ex_c_id = sym_index.get("hipLaunchKernelExC", None)
+            hip_memcpy_async_id = sym_index.get("hipMemcpyAsync", None)
+            hip_memset_async_id = sym_index.get("hipMemsetAsync", None)
             mtia_launch_kernel_id = sym_index.get(
                 "runFunction - job_prep_and_submit_for_execution", None
             )
 
-            # get correlation id's of cudaLaunchKernel events
+            # get correlation IDs of the CUDA/HIP/MTIA kernel launch events
             launch_ids = [
                 cuda_launch_kernel_id,
                 cuda_launch_kernel_ex_c_id,
+                hip_launch_kernel_id,
+                hip_launch_kernel_ex_c_id,
                 mtia_launch_kernel_id,
             ]
             cuda_launch_kernel_correlation_series: pd.Series = trace_df[
                 trace_df["name"].isin(launch_ids)
             ].correlation
 
-            # whether to use memory events - cudaMemsetAsync and cudaMemcpyAsync.
+            # whether to also use CUDA/HIP MemsetAsync and MemcpyAsync memory events.
             if include_memory_events:
+                memory_event_ids = [
+                    i for i in [cuda_memset_async_id, cuda_memcpy_async_id,
+                                hip_memset_async_id, hip_memcpy_async_id] if i is not None
+                ]
                 memory_event_correlation_series: pd.Series = trace_df[
-                    (trace_df["name"] == cuda_memset_async_id)
-                    | (trace_df["name"] == cuda_memcpy_async_id)
+                    trace_df["name"].isin(memory_event_ids)
                 ].correlation
                 merged_series: pd.Series = pd.concat(
                     [
diff --git a/tests/test_hip_kernel_launch_stats.py b/tests/test_hip_kernel_launch_stats.py
new file mode 100644
index 0000000..ad24576
--- /dev/null
+++ b/tests/test_hip_kernel_launch_stats.py
@@ -0,0 +1,350 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Unit tests for HIP/ROCm support in cuda_kernel_launch_stats.
+
+These tests verify that the `cuda_kernel_launch_stats` method in
+`CudaKernelAnalysis` correctly handles HIP runtime events
+(hipLaunchKernel, hipLaunchKernelExC, hipMemcpyAsync, hipMemsetAsync)
+in addition to the existing CUDA and MTIA events.
+"""
+
+import unittest
+from unittest.mock import MagicMock, PropertyMock
+
+import pandas as pd
+
+from hta.analyzers.cuda_kernel_analysis import CudaKernelAnalysis
+
+
+def _create_mock_trace(sym_index: dict, trace_df: pd.DataFrame) -> MagicMock:
+    """Helper to create a mock Trace object with the given symbol index and trace DataFrame."""
+    mock_trace = MagicMock()
+    mock_symbol_table = MagicMock()
+    mock_symbol_table.get_sym_id_map.return_value = sym_index
+    type(mock_trace).symbol_table = PropertyMock(return_value=mock_symbol_table)
+    mock_trace.get_trace.return_value = trace_df
+    return mock_trace
+
+
+class TestHipKernelLaunchStats(unittest.TestCase):
+    """Tests for HIP/ROCm support in CudaKernelAnalysis.cuda_kernel_launch_stats."""
+
+    def test_hip_launch_kernel_events_are_captured(self):
+        """Verify hipLaunchKernel events are included in the launch stats."""
+        # Symbol IDs
+        sym_index = {
+            "hipLaunchKernel": 100,
+        }
+
+        # Create trace DataFrame: CPU runtime event + corresponding GPU kernel
+        trace_df = pd.DataFrame(
+            {
+                "name": [100, 200],
+                "stream": [-1, 1],
+                "correlation": [1001, 1001],
+                "dur": [10, 50],
+                "ts": [100, 120],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], visualize=False
+        )
+        df = result[0]
+
+        self.assertEqual(len(df), 1)
+        self.assertEqual(df.iloc[0]["cpu_duration"], 10)
+        self.assertEqual(df.iloc[0]["gpu_duration"], 50)
+        # launch_delay = gpu_ts - (cpu_ts + cpu_dur) = 120 - (100 + 10) = 10
+        self.assertEqual(df.iloc[0]["launch_delay"], 10)
+
+    def test_hip_launch_kernel_ex_c_events_are_captured(self):
+        """Verify hipLaunchKernelExC events are included in the launch stats."""
+        sym_index = {
+            "hipLaunchKernelExC": 101,
+        }
+
+        trace_df = pd.DataFrame(
+            {
+                "name": [101, 201],
+                "stream": [-1, 2],
+                "correlation": [2001, 2001],
+                "dur": [15, 80],
+                "ts": [200, 230],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], visualize=False
+        )
+        df = result[0]
+
+        self.assertEqual(len(df), 1)
+        self.assertEqual(df.iloc[0]["cpu_duration"], 15)
+        self.assertEqual(df.iloc[0]["gpu_duration"], 80)
+        # launch_delay = 230 - (200 + 15) = 15
+        self.assertEqual(df.iloc[0]["launch_delay"], 15)
+
+    def test_hip_memory_events_included(self):
+        """Verify hipMemcpyAsync and hipMemsetAsync are included when include_memory_events=True."""
+        sym_index = {
+            "hipMemcpyAsync": 102,
+            "hipMemsetAsync": 103,
+        }
+
+        trace_df = pd.DataFrame(
+            {
+                "name": [102, 103, 202, 203],
+                "stream": [-1, -1, 3, 4],
+                "correlation": [3001, 3002, 3001, 3002],
+                "dur": [5, 8, 20, 30],
+                "ts": [300, 400, 310, 415],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], include_memory_events=True, visualize=False
+        )
+        df = result[0]
+
+        self.assertEqual(len(df), 2)
+        # hipMemcpyAsync event
+        memcpy_row = df[df["correlation"] == 3001].iloc[0]
+        self.assertEqual(memcpy_row["cpu_duration"], 5)
+        self.assertEqual(memcpy_row["gpu_duration"], 20)
+        # launch_delay = 310 - (300 + 5) = 5
+        self.assertEqual(memcpy_row["launch_delay"], 5)
+
+        # hipMemsetAsync event
+        memset_row = df[df["correlation"] == 3002].iloc[0]
+        self.assertEqual(memset_row["cpu_duration"], 8)
+        self.assertEqual(memset_row["gpu_duration"], 30)
+        # launch_delay = 415 - (400 + 8) = 7
+        self.assertEqual(memset_row["launch_delay"], 7)
+
+    def test_hip_memory_events_excluded(self):
+        """Verify hipMemcpyAsync/hipMemsetAsync are excluded when include_memory_events=False."""
+        sym_index = {
+            "hipLaunchKernel": 100,
+            "hipMemcpyAsync": 102,
+            "hipMemsetAsync": 103,
+        }
+
+        trace_df = pd.DataFrame(
+            {
+                "name": [100, 102, 103, 200, 202, 203],
+                "stream": [-1, -1, -1, 1, 3, 4],
+                "correlation": [1001, 3001, 3002, 1001, 3001, 3002],
+                "dur": [10, 5, 8, 50, 20, 30],
+                "ts": [100, 300, 400, 120, 310, 415],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], include_memory_events=False, visualize=False
+        )
+        df = result[0]
+
+        # Only hipLaunchKernel should be captured, not the memory events
+        self.assertEqual(len(df), 1)
+        self.assertEqual(df.iloc[0]["correlation"], 1001)
+
+    def test_mixed_cuda_and_hip_events(self):
+        """Verify both CUDA and HIP events are captured together in a mixed trace."""
+        sym_index = {
+            "cudaLaunchKernel": 10,
+            "hipLaunchKernel": 100,
+            "cudaMemcpyAsync": 11,
+            "hipMemcpyAsync": 102,
+        }
+
+        trace_df = pd.DataFrame(
+            {
+                "name": [10, 100, 11, 102, 210, 2100, 211, 2102],
+                "stream": [-1, -1, -1, -1, 1, 2, 3, 4],
+                "correlation": [
+                    1001,
+                    2001,
+                    1002,
+                    2002,
+                    1001,
+                    2001,
+                    1002,
+                    2002,
+                ],
+                "dur": [10, 12, 5, 6, 50, 60, 20, 25],
+                "ts": [100, 200, 300, 400, 120, 220, 310, 410],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], include_memory_events=True, visualize=False
+        )
+        df = result[0]
+
+        # All 4 events should be captured
+        self.assertEqual(len(df), 4)
+        correlations = sorted(df["correlation"].tolist())
+        self.assertEqual(correlations, [1001, 1002, 2001, 2002])
+
+    def test_negative_launch_delay_clipped_to_zero(self):
+        """Verify that negative launch delays are clipped to 0 for HIP events."""
+        sym_index = {
+            "hipLaunchKernel": 100,
+        }
+
+        # GPU kernel starts before CPU op ends (overlap)
+        trace_df = pd.DataFrame(
+            {
+                "name": [100, 200],
+                "stream": [-1, 1],
+                "correlation": [1001, 1001],
+                "dur": [20, 50],
+                "ts": [100, 105],  # gpu starts at 105, cpu ends at 120
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], visualize=False
+        )
+        df = result[0]
+
+        # launch_delay = 105 - (100 + 20) = -15, should be clipped to 0
+        self.assertEqual(df.iloc[0]["launch_delay"], 0)
+
+    def test_hip_only_no_cuda_symbols(self):
+        """Verify the function works when only HIP symbols exist (pure ROCm trace)."""
+        sym_index = {
+            "hipLaunchKernel": 100,
+            "hipLaunchKernelExC": 101,
+            "hipMemcpyAsync": 102,
+            "hipMemsetAsync": 103,
+        }
+
+        trace_df = pd.DataFrame(
+            {
+                "name": [100, 101, 102, 103, 200, 201, 202, 203],
+                "stream": [-1, -1, -1, -1, 1, 2, 3, 4],
+                "correlation": [
+                    1001,
+                    1002,
+                    1003,
+                    1004,
+                    1001,
+                    1002,
+                    1003,
+                    1004,
+                ],
+                "dur": [10, 12, 5, 8, 50, 60, 20, 30],
+                "ts": [100, 200, 300, 400, 115, 220, 310, 415],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], include_memory_events=True, visualize=False
+        )
+        df = result[0]
+
+        self.assertEqual(len(df), 4)
+
+        # Verify each event
+        row_1001 = df[df["correlation"] == 1001].iloc[0]
+        self.assertEqual(row_1001["cpu_duration"], 10)
+        self.assertEqual(row_1001["gpu_duration"], 50)
+        self.assertEqual(row_1001["launch_delay"], 5)  # 115 - 110
+
+        row_1002 = df[df["correlation"] == 1002].iloc[0]
+        self.assertEqual(row_1002["cpu_duration"], 12)
+        self.assertEqual(row_1002["gpu_duration"], 60)
+        self.assertEqual(row_1002["launch_delay"], 8)  # 220 - 212
+
+        row_1003 = df[df["correlation"] == 1003].iloc[0]
+        self.assertEqual(row_1003["cpu_duration"], 5)
+        self.assertEqual(row_1003["gpu_duration"], 20)
+        self.assertEqual(row_1003["launch_delay"], 5)  # 310 - 305
+
+        row_1004 = df[df["correlation"] == 1004].iloc[0]
+        self.assertEqual(row_1004["cpu_duration"], 8)
+        self.assertEqual(row_1004["gpu_duration"], 30)
+        self.assertEqual(row_1004["launch_delay"], 7)  # 415 - 408
+
+    def test_empty_trace_returns_empty_result(self):
+        """Verify behavior with a trace that has no matching launch events."""
+        sym_index = {
+            "hipLaunchKernel": 100,
+        }
+
+        # Trace with no events matching hipLaunchKernel
+        trace_df = pd.DataFrame(
+            {
+                "name": [999, 998],
+                "stream": [-1, 1],
+                "correlation": [1001, 1001],
+                "dur": [10, 50],
+                "ts": [100, 120],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df)
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0], visualize=False
+        )
+        df = result[0]
+
+        self.assertEqual(len(df), 0)
+
+    def test_multiple_ranks_with_hip_events(self):
+        """Verify HIP events are processed correctly for multiple ranks."""
+        sym_index = {
+            "hipLaunchKernel": 100,
+        }
+
+        trace_df_rank0 = pd.DataFrame(
+            {
+                "name": [100, 200],
+                "stream": [-1, 1],
+                "correlation": [1001, 1001],
+                "dur": [10, 50],
+                "ts": [100, 120],
+            }
+        )
+
+        trace_df_rank1 = pd.DataFrame(
+            {
+                "name": [100, 200],
+                "stream": [-1, 1],
+                "correlation": [2001, 2001],
+                "dur": [15, 70],
+                "ts": [200, 225],
+            }
+        )
+
+        mock_trace = _create_mock_trace(sym_index, trace_df_rank0)
+        mock_trace.get_trace.side_effect = lambda rank: (
+            trace_df_rank0 if rank == 0 else trace_df_rank1
+        )
+
+        result = CudaKernelAnalysis.cuda_kernel_launch_stats(
+            mock_trace, ranks=[0, 1], visualize=False
+        )
+
+        self.assertIn(0, result)
+        self.assertIn(1, result)
+        self.assertEqual(len(result[0]), 1)
+        self.assertEqual(len(result[1]), 1)
+        self.assertEqual(result[0].iloc[0]["gpu_duration"], 50)
+        self.assertEqual(result[1].iloc[0]["gpu_duration"], 70)
+
+
+if __name__ == "__main__":
+    unittest.main()