From 9ffc07d4425452de0bfe737a33b15ba4f78a681b Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 16:28:24 -0400 Subject: [PATCH 01/12] updated trace collection documentation --- README.md | 27 +++++++++++++++++++++++++- docs/source/intro/trace_collection.rst | 13 ++++++++----- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 001ab3ef..0f51a1c6 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,31 @@ Learn more about the features and the API from our [documentation](https://hta.r ### Data Preparation All traces collected from a job must reside in a unique folder. +An example of trace collection using the PyTorch Profiler is shown below: + +```python +from torch.profiler import profile, schedule, tensorboard_trace_handler + +tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1) +trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True) + +NUM_EPOCHS = 10 # arbitrary number of epochs to profile + +with profile( + activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule = tracing_schedule, + on_trace_ready = trace_handler, + profile_memory = True, + record_shapes = True, + with_stack = True +) as prof: + + for _ in EPOCHS: + for step, batch_data in enumerate(data_loader): + train(batch_data) + prof.step() +``` + ### Analysis in a Jupyter notebook Activate the Conda environment and launch a Jupyter notebook. @@ -94,7 +119,7 @@ jupyter notebook Import HTA, and create a `TraceAnalysis` object ``` python from hta.trace_analysis import TraceAnalysis -analyzer = TraceAnalysis(trace_dir = "/path/to/folder/containing/the/traces") +analyzer = TraceAnalysis(trace_dir = "traces") # path to the trace folder ``` #### Basic Usage diff --git a/docs/source/intro/trace_collection.rst b/docs/source/intro/trace_collection.rst index ca8a04d0..38db9942 100644 --- a/docs/source/intro/trace_collection.rst +++ b/docs/source/intro/trace_collection.rst @@ -23,12 +23,14 @@ To profile, wrap the code in the ``profile`` context manager as shown below. .. code-block:: python :linenos: - :emphasize-lines: 17 + :emphasize-lines: 19 from torch.profiler import profile, schedule, tensorboard_trace_handler tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1) - trace_handler = tensorboard_trace_handler(dir_name=/output/folder, use_gzip=True) + trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True) + + NUM_EPOCHS = 5 # arbitrary number of epochs to profile with profile( activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA], @@ -39,9 +41,10 @@ To profile, wrap the code in the ``profile`` context manager as shown below. with_stack = True ) as prof: - for step, batch_data in enumerate(data_loader): - train(batch_data) - prof.step() + for _ in range(NUM_EPOCHS): + for step, batch_data in enumerate(data_loader): + train(batch_data) + prof.step() Line 17 in the code snippet above signals to the profiler that a training iteration has completed. From edb517a19fada9a17b6fa5213d93730b80696883 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 16:34:17 -0400 Subject: [PATCH 02/12] convert unspecified rank from logger.error to ValueError --- hta/common/trace_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hta/common/trace_file.py b/hta/common/trace_file.py index 6244ee44..8e033580 100644 --- a/hta/common/trace_file.py +++ b/hta/common/trace_file.py @@ -58,7 +58,7 @@ def create_rank_to_trace_dict(trace_dir: str) -> Tuple[bool, Dict]: ) rank_to_trace_dict[int(rank)] = file_path else: - logger.error( + raise ValueError( "If the trace file does not have the rank specified in it, then add the following snippet " 'key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple ' "traces files, then each file should have a unique rank value." From 71784d1beec53cc0fe73985a8f6b70fef5fbdb8a Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 17:52:41 -0400 Subject: [PATCH 03/12] fixed minor spelling mistake --- examples/cupti_profiler_demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cupti_profiler_demo.py b/examples/cupti_profiler_demo.py index 1a5946d3..d9462fa5 100644 --- a/examples/cupti_profiler_demo.py +++ b/examples/cupti_profiler_demo.py @@ -3,7 +3,7 @@ # LICENSE file in the root directory of this source tree. """ -Measuring CUPTI performanc metrics using CUPTI Profiler. +Measuring CUPTI performance metrics using CUPTI Profiler. This is supported on V100 and higher NVIDIA GPUs. """ From 083930c4bfa9d9467b33c01dc189009036ce6572 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:19:31 -0400 Subject: [PATCH 04/12] Added more examples --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0f51a1c6..31105890 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,8 @@ An example of trace collection using the PyTorch Profiler is shown below: ```python from torch.profiler import profile, schedule, tensorboard_trace_handler -tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1) -trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True) +tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1) +trace_handler = tensorboard_trace_handler(dir_name = "traces/", use_gzip = True) NUM_EPOCHS = 10 # arbitrary number of epochs to profile @@ -102,7 +102,7 @@ with profile( with_stack = True ) as prof: - for _ in EPOCHS: + for _ in range(EPOCHS): for step, batch_data in enumerate(data_loader): train(batch_data) prof.step() @@ -119,7 +119,11 @@ jupyter notebook Import HTA, and create a `TraceAnalysis` object ``` python from hta.trace_analysis import TraceAnalysis -analyzer = TraceAnalysis(trace_dir = "traces") # path to the trace folder +analyzer = TraceAnalysis(trace_dir = "traces/") # path to the trace folder + +# or + +analyzer = TraceAnalysis(trace_files={0: 'trace_0.json', 1: 'trace_1.json.gz'}) ``` #### Basic Usage From 78e0c946aa0fb343f3339db28659e63346a4e8e7 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:20:19 -0400 Subject: [PATCH 05/12] updated code indentation and clarity for cupti sample --- .../features/cupti_counter_analysis.rst | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/features/cupti_counter_analysis.rst b/docs/source/features/cupti_counter_analysis.rst index 168a8359..4a1edbda 100644 --- a/docs/source/features/cupti_counter_analysis.rst +++ b/docs/source/features/cupti_counter_analysis.rst @@ -22,18 +22,21 @@ an example. .. code-block:: python with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CUDA, - torch.profiler.ProfilerActivity.CPU], - record_shapes=True, - on_trace_ready=trace_handler, - experimental_config=torch.profiler._ExperimentalConfig( - profiler_metrics=[ + activities = [ + torch.profiler.ProfilerActivity.CUDA, + torch.profiler.ProfilerActivity.CPU + ], + record_shapes = True, + on_trace_ready = trace_handler, + experimental_config = torch.profiler._ExperimentalConfig( + profiler_metrics = [ "kineto__tensor_core_insts", "dram__bytes_read.sum", - "dram__bytes_write.sum"], - profiler_measure_per_kernel=True), + "dram__bytes_write.sum" + ], + profiler_measure_per_kernel = True), ) as prof: - res = train_batch(modeldef) + res = train_batch(model) prof.step() The generated trace contains the following additional information: @@ -55,7 +58,8 @@ The code below runs CUPTI counter analysis on the collected trace. .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") + gpu_kernels = analyzer.get_cupti_counter_data_with_operators(ranks=[0])[0] It returns a list of dataframes, one per rank or trace file. Each dataframe From 8cc9139471f4e7a215cc7af1df221c9444b10aa4 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:20:50 -0400 Subject: [PATCH 06/12] fix: spelling --- docs/source/features/trace_diff.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/trace_diff.rst b/docs/source/features/trace_diff.rst index 6ed03e8e..255cb78c 100644 --- a/docs/source/features/trace_diff.rst +++ b/docs/source/features/trace_diff.rst @@ -53,7 +53,7 @@ follows: .. code-block:: python df = compare_traces_output.sort_values(by="diff_duration", ascending=False) - # The duration differerence can be overshadowed by the "ProfilerStep", + # The duration difference can be overshadowed by the "ProfilerStep", # so we can filter it out to show the trend of other operators. df = df.loc[~df.index.str.startswith("ProfilerStep")].head(10) TraceDiff.visualize_duration_diff(df) From d57f6aa2ca72d4ad60f11601dedfc18ef403720c Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:21:12 -0400 Subject: [PATCH 07/12] added spacing and fixed line number --- docs/source/intro/trace_collection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/intro/trace_collection.rst b/docs/source/intro/trace_collection.rst index 38db9942..5b553954 100644 --- a/docs/source/intro/trace_collection.rst +++ b/docs/source/intro/trace_collection.rst @@ -27,7 +27,7 @@ To profile, wrap the code in the ``profile`` context manager as shown below. from torch.profiler import profile, schedule, tensorboard_trace_handler - tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1) + tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1) trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True) NUM_EPOCHS = 5 # arbitrary number of epochs to profile @@ -46,5 +46,5 @@ To profile, wrap the code in the ``profile`` context manager as shown below. train(batch_data) prof.step() -Line 17 in the code snippet above signals to the profiler that a training +Line 19 in the code snippet above signals to the profiler that a training iteration has completed. From 74499371eb37fd1d56b85e69790fd87cb730b8f0 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:21:43 -0400 Subject: [PATCH 08/12] added `cd` command to docs --- docs/source/intro/installation.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst index 13929bba..5046bac3 100644 --- a/docs/source/intro/installation.rst +++ b/docs/source/intro/installation.rst @@ -32,7 +32,10 @@ Install from source .. code-block:: # get the source code - git clone https://github.com/facebookresearch/HolisticTraceAnalysis.git + git clone https://github.com/facebookresearch/HolisticTraceAnalysis + + # move into the cloned directory + cd HolisticTraceAnalysis # execute the command below from the root of the repo pip install -e . From 12ab96535927dce9a2dd5707b1040c7085f0ba89 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:21:57 -0400 Subject: [PATCH 09/12] changed traces example --- docs/source/features/augmented_counters.rst | 2 +- docs/source/features/comm_comp_overlap.rst | 2 +- docs/source/features/cuda_kernel_launch_stats.rst | 3 ++- docs/source/features/frequent_cuda_kernels.rst | 3 ++- docs/source/features/idle_time_breakdown.rst | 2 +- docs/source/features/kernel_breakdown.rst | 3 ++- docs/source/features/temporal_breakdown.rst | 3 ++- docs/source/intro/using_hta.rst | 9 ++++----- 8 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/source/features/augmented_counters.rst b/docs/source/features/augmented_counters.rst index 0857ae16..43ce1171 100644 --- a/docs/source/features/augmented_counters.rst +++ b/docs/source/features/augmented_counters.rst @@ -25,7 +25,7 @@ API. .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") analyzer.generate_trace_with_counters() A screenshot of the generated trace file with augmented counters. diff --git a/docs/source/features/comm_comp_overlap.rst b/docs/source/features/comm_comp_overlap.rst index 3c9fe5c6..da3af33e 100644 --- a/docs/source/features/comm_comp_overlap.rst +++ b/docs/source/features/comm_comp_overlap.rst @@ -19,7 +19,7 @@ Communication computation overlap can be calculated as follows: .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") overlap_df = analyzer.get_comm_comp_overlap() The function returns a dataframe containing the overlap percentage diff --git a/docs/source/features/cuda_kernel_launch_stats.rst b/docs/source/features/cuda_kernel_launch_stats.rst index 24253177..802a2e51 100644 --- a/docs/source/features/cuda_kernel_launch_stats.rst +++ b/docs/source/features/cuda_kernel_launch_stats.rst @@ -12,7 +12,8 @@ CPU operator ending. The kernel launch info can be generated as follows: .. code-block:: python - analyzer = TraceAnalysis(trace_dir="/path/to/trace/dir") + analyzer = TraceAnalysis(trace_dir = "traces/") + kernel_info_df = analyzer.get_cuda_kernel_launch_stats() A screenshot of the generated dataframe is given below. diff --git a/docs/source/features/frequent_cuda_kernels.rst b/docs/source/features/frequent_cuda_kernels.rst index 05045924..956e1a51 100644 --- a/docs/source/features/frequent_cuda_kernels.rst +++ b/docs/source/features/frequent_cuda_kernels.rst @@ -22,7 +22,8 @@ be the same across different ranks. .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace_folder") + analyzer = TraceAnalysis(trace_dir = "traces/") + cuda_sequences_df = analyzer.get_frequent_cuda_kernel_sequences( operator_name = "aten::linear", output_dir = "/tmp/" diff --git a/docs/source/features/idle_time_breakdown.rst b/docs/source/features/idle_time_breakdown.rst index 6ed6f498..3405f2db 100644 --- a/docs/source/features/idle_time_breakdown.rst +++ b/docs/source/features/idle_time_breakdown.rst @@ -33,7 +33,7 @@ function. The idle time breakdown can be generated as follows: .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") idle_time_df = analyzer.get_idle_time_breakdown() .. image:: ../_static/idle_time_breakdown_percentage.png diff --git a/docs/source/features/kernel_breakdown.rst b/docs/source/features/kernel_breakdown.rst index 2fa93970..a8f184b3 100644 --- a/docs/source/features/kernel_breakdown.rst +++ b/docs/source/features/kernel_breakdown.rst @@ -13,7 +13,8 @@ The kernel breakdown can be calculated as follows: .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") + kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown() The first dataframe returned by the function contains the raw values used to diff --git a/docs/source/features/temporal_breakdown.rst b/docs/source/features/temporal_breakdown.rst index 8c063b59..409cc1f6 100644 --- a/docs/source/features/temporal_breakdown.rst +++ b/docs/source/features/temporal_breakdown.rst @@ -25,7 +25,8 @@ The temporal breakdown can be calculated as follows: .. code-block:: python - analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer = TraceAnalysis(trace_dir = "traces/") + time_spent_df = analyzer.get_temporal_breakdown() The function returns a dataframe containing the temporal breakdown for each rank. diff --git a/docs/source/intro/using_hta.rst b/docs/source/intro/using_hta.rst index 3c7d6ef0..05fd9f53 100644 --- a/docs/source/intro/using_hta.rst +++ b/docs/source/intro/using_hta.rst @@ -13,8 +13,7 @@ Trace Analysis .. code-block:: python from hta.trace_analysis import TraceAnalysis - analyzer = TraceAnalysis(trace_dir = "/trace/folder/path") - + analyzer = TraceAnalysis(trace_dir = "traces/") Using the features is straightforward. E.g. @@ -48,9 +47,9 @@ Using the features is straightforward. E.g. cuda_kernel_launch_stats = analyzer.get_cuda_kernel_launch_stats() # Frequent CUDA kernel sequences - frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences(operator_name="aten::linear", - output_dir="/output/trace/path" - ) + frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences( + operator_name="aten::linear", output_dir="/output/trace/path" + ) To learn more about the features in detail we refer the reader to the **Features** section. The features can be tuned by various From 860e1ee20b74a6f56e3f435f588d9fc20bee079d Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:22:59 -0400 Subject: [PATCH 10/12] changed path_to_hta to PATH_TO_TRACES (the folder contains traces, not analysis) --- examples/identify_stragglers.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/identify_stragglers.ipynb b/examples/identify_stragglers.ipynb index 4bcb8c0d..30a0eb95 100644 --- a/examples/identify_stragglers.ipynb +++ b/examples/identify_stragglers.ipynb @@ -86,7 +86,7 @@ "outputs": [], "source": [ "# Set path to HolisticTraceAnalysis folder\n", - "path_to_hta = \"/path/to/HolisticTraceAnalysis\"" + "PATH_TO_TRACES = \"traces/\"" ] }, { @@ -106,7 +106,7 @@ "%%time\n", "from hta.trace_analysis import TraceAnalysis\n", "\n", - "trace_dir = path_to_hta + \"/tests/data/vision_transformer\"\n", + "trace_dir = PATH_TO_TRACES + \"/tests/data/vision_transformer\"\n", "print(trace_dir)\n", "analyzer = TraceAnalysis(trace_dir = trace_path)" ] From 45881728347b5acf559b27d83a011ee738d206be Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:23:33 -0400 Subject: [PATCH 11/12] formatted example to wrap --- examples/kernel_breakdown_demo.ipynb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/kernel_breakdown_demo.ipynb b/examples/kernel_breakdown_demo.ipynb index 78c7701a..4aedbcf1 100644 --- a/examples/kernel_breakdown_demo.ipynb +++ b/examples/kernel_breakdown_demo.ipynb @@ -245,10 +245,9 @@ } ], "source": [ - "kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown( \n", - " num_kernels=5, \n", - " include_memory_kernels=True, \n", - " image_renderer=\"png\")" + "kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown(\n", + " num_kernels=5, include_memory_kernels=True, image_renderer=\"png\"\n", + ")" ] }, { From aa25d7696cceb0ad4328936d7fd2892e8ebd7878 Mon Sep 17 00:00:00 2001 From: William Kaiser Date: Thu, 28 Mar 2024 18:36:56 -0400 Subject: [PATCH 12/12] updated num_epochs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 31105890..ec07989e 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ with profile( with_stack = True ) as prof: - for _ in range(EPOCHS): + for _ in range(NUM_EPOCHS): for step, batch_data in enumerate(data_loader): train(batch_data) prof.step()