From 9ffc07d4425452de0bfe737a33b15ba4f78a681b Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 16:28:24 -0400
Subject: [PATCH 01/12] updated trace collection documentation

---
 README.md                              | 27 +++++++++++++++++++++++++-
 docs/source/intro/trace_collection.rst | 13 ++++++++-----
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 001ab3ef..0f51a1c6 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,31 @@ Learn more about the features and the API from our [documentation](https://hta.r
 ### Data Preparation
 All traces collected from a job must reside in a unique folder.
 
+An example of trace collection using the PyTorch Profiler is shown below:
+
+```python
+from torch.profiler import profile, schedule, tensorboard_trace_handler
+
+tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1)
+trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True)
+
+NUM_EPOCHS = 10 # arbitrary number of epochs to profile
+
+with profile(
+  activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA],
+  schedule = tracing_schedule,
+  on_trace_ready = trace_handler,
+  profile_memory = True,
+  record_shapes = True,
+  with_stack = True
+) as prof:
+
+   for _ in EPOCHS:
+      for step, batch_data in enumerate(data_loader):
+         train(batch_data)
+         prof.step()
+```
+
 ### Analysis in a Jupyter notebook
 
 Activate the Conda environment and launch a Jupyter notebook.
@@ -94,7 +119,7 @@ jupyter notebook
 Import HTA, and create a `TraceAnalysis` object
 ``` python
 from hta.trace_analysis import TraceAnalysis
-analyzer = TraceAnalysis(trace_dir = "/path/to/folder/containing/the/traces")
+analyzer = TraceAnalysis(trace_dir = "traces") # path to the trace folder
 ```
 
 #### Basic Usage
diff --git a/docs/source/intro/trace_collection.rst b/docs/source/intro/trace_collection.rst
index ca8a04d0..38db9942 100644
--- a/docs/source/intro/trace_collection.rst
+++ b/docs/source/intro/trace_collection.rst
@@ -23,12 +23,14 @@ To profile, wrap the code in the ``profile`` context manager as shown below.
 
 .. code-block:: python
     :linenos:
-    :emphasize-lines: 17
+    :emphasize-lines: 20
 
     from torch.profiler import profile, schedule, tensorboard_trace_handler
 
     tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1)
-    trace_handler = tensorboard_trace_handler(dir_name=/output/folder, use_gzip=True)
+    trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True)
+
+    NUM_EPOCHS = 5 # arbitrary number of epochs to profile
 
     with profile(
       activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA],
@@ -39,9 +41,10 @@ To profile, wrap the code in the ``profile`` context manager as shown below.
       with_stack = True
     ) as prof:
 
-        for step, batch_data in enumerate(data_loader):
-            train(batch_data)
-            prof.step()
+        for _ in range(NUM_EPOCHS):
+          for step, batch_data in enumerate(data_loader):
+              train(batch_data)
+              prof.step()
 
 Line 17 in the code snippet above signals to the profiler that a training
 iteration has completed.

From edb517a19fada9a17b6fa5213d93730b80696883 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 16:34:17 -0400
Subject: [PATCH 02/12] convert unspecified rank from logger.error to
 ValueError

---
 hta/common/trace_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hta/common/trace_file.py b/hta/common/trace_file.py
index 6244ee44..8e033580 100644
--- a/hta/common/trace_file.py
+++ b/hta/common/trace_file.py
@@ -58,7 +58,7 @@ def create_rank_to_trace_dict(trace_dir: str) -> Tuple[bool, Dict]:
                     )
                 rank_to_trace_dict[int(rank)] = file_path
             else:
-                logger.error(
+                raise ValueError(
                     "If the trace file does not have the rank specified in it, then add the following snippet "
                     'key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple '
                     "traces files, then each file should have a unique rank value."

From 71784d1beec53cc0fe73985a8f6b70fef5fbdb8a Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 17:52:41 -0400
Subject: [PATCH 03/12] fixed minor spelling mistake

---
 examples/cupti_profiler_demo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cupti_profiler_demo.py b/examples/cupti_profiler_demo.py
index 1a5946d3..d9462fa5 100644
--- a/examples/cupti_profiler_demo.py
+++ b/examples/cupti_profiler_demo.py
@@ -3,7 +3,7 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-Measuring CUPTI performanc metrics using CUPTI Profiler.
+Measuring CUPTI performance metrics using CUPTI Profiler.
 This is supported on V100 and higher NVIDIA GPUs.
 """
 

From 083930c4bfa9d9467b33c01dc189009036ce6572 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:19:31 -0400
Subject: [PATCH 04/12] Added more examples

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0f51a1c6..31105890 100644
--- a/README.md
+++ b/README.md
@@ -88,8 +88,8 @@ An example of trace collection using the PyTorch Profiler is shown below:
 ```python
-from torch.profiler import profile, schedule, tensorboard_trace_handler
+from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
 
-tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1)
-trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True)
+tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1)
+trace_handler = tensorboard_trace_handler(dir_name = "traces/", use_gzip = True)
 
 NUM_EPOCHS = 10 # arbitrary number of epochs to profile
 
@@ -102,7 +102,7 @@ with profile(
   with_stack = True
 ) as prof:
 
-   for _ in EPOCHS:
+   for _ in range(EPOCHS):
       for step, batch_data in enumerate(data_loader):
          train(batch_data)
          prof.step()
@@ -119,7 +119,11 @@ jupyter notebook
 Import HTA, and create a `TraceAnalysis` object
 ``` python
 from hta.trace_analysis import TraceAnalysis
-analyzer = TraceAnalysis(trace_dir = "traces") # path to the trace folder
+analyzer = TraceAnalysis(trace_dir = "traces/") # path to the trace folder
+
+# or
+
+analyzer = TraceAnalysis(trace_files={0: 'trace_0.json', 1: 'trace_1.json.gz'})
 ```
 
 #### Basic Usage

From 78e0c946aa0fb343f3339db28659e63346a4e8e7 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:20:19 -0400
Subject: [PATCH 05/12] updated code indentation and clarity for cupti sample

---
 .../features/cupti_counter_analysis.rst       | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/docs/source/features/cupti_counter_analysis.rst b/docs/source/features/cupti_counter_analysis.rst
index 168a8359..4a1edbda 100644
--- a/docs/source/features/cupti_counter_analysis.rst
+++ b/docs/source/features/cupti_counter_analysis.rst
@@ -22,18 +22,21 @@ an example.
 .. code-block:: python
 
     with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CUDA,
-                    torch.profiler.ProfilerActivity.CPU],
-        record_shapes=True,
-        on_trace_ready=trace_handler,
-        experimental_config=torch.profiler._ExperimentalConfig(
-            profiler_metrics=[
+        activities = [
+            torch.profiler.ProfilerActivity.CUDA,
+            torch.profiler.ProfilerActivity.CPU
+        ],
+        record_shapes = True,
+        on_trace_ready = trace_handler,
+        experimental_config = torch.profiler._ExperimentalConfig(
+            profiler_metrics = [
                 "kineto__tensor_core_insts",
                 "dram__bytes_read.sum",
-                "dram__bytes_write.sum"],
-        profiler_measure_per_kernel=True),
+                "dram__bytes_write.sum"
+            ],
+            profiler_measure_per_kernel = True),
     ) as prof:
-        res = train_batch(modeldef)
+        res = train_batch(model)
         prof.step()
 
 The generated trace contains the following additional information:
@@ -55,7 +58,8 @@ The code below runs CUPTI counter analysis on the collected trace.
 
 .. code-block:: python
 
-   analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+   analyzer = TraceAnalysis(trace_dir = "traces/")
+
    gpu_kernels = analyzer.get_cupti_counter_data_with_operators(ranks=[0])[0]
 
 It returns a list of dataframes, one per rank or trace file. Each dataframe

From 8cc9139471f4e7a215cc7af1df221c9444b10aa4 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:20:50 -0400
Subject: [PATCH 06/12] fix: spelling

---
 docs/source/features/trace_diff.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/features/trace_diff.rst b/docs/source/features/trace_diff.rst
index 6ed03e8e..255cb78c 100644
--- a/docs/source/features/trace_diff.rst
+++ b/docs/source/features/trace_diff.rst
@@ -53,7 +53,7 @@ follows:
 .. code-block:: python
 
     df = compare_traces_output.sort_values(by="diff_duration", ascending=False)
-    # The duration differerence can be overshadowed by the "ProfilerStep",
+    # The duration difference can be overshadowed by the "ProfilerStep",
     # so we can filter it out to show the trend of other operators.
     df = df.loc[~df.index.str.startswith("ProfilerStep")].head(10)
     TraceDiff.visualize_duration_diff(df)

From d57f6aa2ca72d4ad60f11601dedfc18ef403720c Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:21:12 -0400
Subject: [PATCH 07/12] added spacing and fixed line number

---
 docs/source/intro/trace_collection.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/intro/trace_collection.rst b/docs/source/intro/trace_collection.rst
index 38db9942..5b553954 100644
--- a/docs/source/intro/trace_collection.rst
+++ b/docs/source/intro/trace_collection.rst
@@ -27,7 +27,7 @@ To profile, wrap the code in the ``profile`` context manager as shown below.
 
-    from torch.profiler import profile, schedule, tensorboard_trace_handler
+    from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
 
-    tracing_schedule = schedule(skip_first=5, wait=5, warmup=2, active=2, repeat=1)
+    tracing_schedule = schedule(skip_first = 5, wait = 5, warmup = 2, active = 2, repeat = 1)
     trace_handler = tensorboard_trace_handler(dir_name="traces", use_gzip=True)
 
     NUM_EPOCHS = 5 # arbitrary number of epochs to profile
@@ -46,5 +46,5 @@ To profile, wrap the code in the ``profile`` context manager as shown below.
               train(batch_data)
               prof.step()
 
-Line 17 in the code snippet above signals to the profiler that a training
+Line 20 in the code snippet above signals to the profiler that a training
 iteration has completed.

From 74499371eb37fd1d56b85e69790fd87cb730b8f0 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:21:43 -0400
Subject: [PATCH 08/12] added `cd` command to docs

---
 docs/source/intro/installation.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/intro/installation.rst b/docs/source/intro/installation.rst
index 13929bba..5046bac3 100644
--- a/docs/source/intro/installation.rst
+++ b/docs/source/intro/installation.rst
@@ -32,7 +32,10 @@ Install from source
 .. code-block::
 
   # get the source code
-  git clone https://github.com/facebookresearch/HolisticTraceAnalysis.git
+  git clone https://github.com/facebookresearch/HolisticTraceAnalysis
+
+  # move into the cloned directory
+  cd HolisticTraceAnalysis
 
   # execute the command below from the root of the repo
   pip install -e .

From 12ab96535927dce9a2dd5707b1040c7085f0ba89 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:21:57 -0400
Subject: [PATCH 09/12] changed traces example

---
 docs/source/features/augmented_counters.rst       | 2 +-
 docs/source/features/comm_comp_overlap.rst        | 2 +-
 docs/source/features/cuda_kernel_launch_stats.rst | 3 ++-
 docs/source/features/frequent_cuda_kernels.rst    | 3 ++-
 docs/source/features/idle_time_breakdown.rst      | 2 +-
 docs/source/features/kernel_breakdown.rst         | 3 ++-
 docs/source/features/temporal_breakdown.rst       | 3 ++-
 docs/source/intro/using_hta.rst                   | 9 ++++-----
 8 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/docs/source/features/augmented_counters.rst b/docs/source/features/augmented_counters.rst
index 0857ae16..43ce1171 100644
--- a/docs/source/features/augmented_counters.rst
+++ b/docs/source/features/augmented_counters.rst
@@ -25,7 +25,7 @@ API.
 
 .. code-block:: python
 
-  analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+  analyzer = TraceAnalysis(trace_dir = "traces/")
   analyzer.generate_trace_with_counters()
 
 A screenshot of the generated trace file with augmented counters.
diff --git a/docs/source/features/comm_comp_overlap.rst b/docs/source/features/comm_comp_overlap.rst
index 3c9fe5c6..da3af33e 100644
--- a/docs/source/features/comm_comp_overlap.rst
+++ b/docs/source/features/comm_comp_overlap.rst
@@ -19,7 +19,7 @@ Communication computation overlap can be calculated as follows:
 
 .. code-block:: python
 
-   analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+   analyzer = TraceAnalysis(trace_dir = "traces/")
    overlap_df = analyzer.get_comm_comp_overlap()
 
 The function returns a dataframe containing the overlap percentage
diff --git a/docs/source/features/cuda_kernel_launch_stats.rst b/docs/source/features/cuda_kernel_launch_stats.rst
index 24253177..802a2e51 100644
--- a/docs/source/features/cuda_kernel_launch_stats.rst
+++ b/docs/source/features/cuda_kernel_launch_stats.rst
@@ -12,7 +12,8 @@ CPU operator ending. The kernel launch info can be generated as follows:
 
 .. code-block:: python
 
-  analyzer = TraceAnalysis(trace_dir="/path/to/trace/dir")
+  analyzer = TraceAnalysis(trace_dir = "traces/")
+
   kernel_info_df = analyzer.get_cuda_kernel_launch_stats()
 
 A screenshot of the generated dataframe is given below.
diff --git a/docs/source/features/frequent_cuda_kernels.rst b/docs/source/features/frequent_cuda_kernels.rst
index 05045924..956e1a51 100644
--- a/docs/source/features/frequent_cuda_kernels.rst
+++ b/docs/source/features/frequent_cuda_kernels.rst
@@ -22,7 +22,8 @@ be the same across different ranks.
 
 .. code-block:: python
 
-    analyzer = TraceAnalysis(trace_dir = "/path/to/trace_folder")
+    analyzer = TraceAnalysis(trace_dir = "traces/")
+
     cuda_sequences_df = analyzer.get_frequent_cuda_kernel_sequences(
         operator_name = "aten::linear",
         output_dir = "/tmp/"
diff --git a/docs/source/features/idle_time_breakdown.rst b/docs/source/features/idle_time_breakdown.rst
index 6ed6f498..3405f2db 100644
--- a/docs/source/features/idle_time_breakdown.rst
+++ b/docs/source/features/idle_time_breakdown.rst
@@ -33,7 +33,7 @@ function. The idle time breakdown can be generated as follows:
 
 .. code-block:: python
 
-  analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+  analyzer = TraceAnalysis(trace_dir = "traces/")
   idle_time_df = analyzer.get_idle_time_breakdown()
 
 .. image:: ../_static/idle_time_breakdown_percentage.png
diff --git a/docs/source/features/kernel_breakdown.rst b/docs/source/features/kernel_breakdown.rst
index 2fa93970..a8f184b3 100644
--- a/docs/source/features/kernel_breakdown.rst
+++ b/docs/source/features/kernel_breakdown.rst
@@ -13,7 +13,8 @@ The kernel breakdown can be calculated as follows:
 
 .. code-block:: python
 
-   analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+   analyzer = TraceAnalysis(trace_dir = "traces/")
+
    kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown()
 
 The first dataframe returned by the function contains the raw values used to
diff --git a/docs/source/features/temporal_breakdown.rst b/docs/source/features/temporal_breakdown.rst
index 8c063b59..409cc1f6 100644
--- a/docs/source/features/temporal_breakdown.rst
+++ b/docs/source/features/temporal_breakdown.rst
@@ -25,7 +25,8 @@ The temporal breakdown can be calculated as follows:
 
 .. code-block:: python
 
-   analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder")
+   analyzer = TraceAnalysis(trace_dir = "traces/")
+
    time_spent_df = analyzer.get_temporal_breakdown()
 
 The function returns a dataframe containing the temporal breakdown for each rank.
diff --git a/docs/source/intro/using_hta.rst b/docs/source/intro/using_hta.rst
index 3c7d6ef0..05fd9f53 100644
--- a/docs/source/intro/using_hta.rst
+++ b/docs/source/intro/using_hta.rst
@@ -13,8 +13,7 @@ Trace Analysis
 .. code-block:: python
 
     from hta.trace_analysis import TraceAnalysis
-    analyzer = TraceAnalysis(trace_dir = "/trace/folder/path")
-
+    analyzer = TraceAnalysis(trace_dir = "traces/")
 
 Using the features is straightforward. E.g.
 
@@ -48,9 +47,9 @@ Using the features is straightforward. E.g.
   cuda_kernel_launch_stats = analyzer.get_cuda_kernel_launch_stats()
 
   # Frequent CUDA kernel sequences
-  frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences(operator_name="aten::linear",
-                                                                    output_dir="/output/trace/path"
-                                                                   )
+  frequent_patterns_df = analyzer.get_frequent_cuda_kernel_sequences(
+      operator_name="aten::linear", output_dir="/output/trace/path"
+  )
 
 To learn more about the features in detail we refer the reader to the
 **Features** section. The features can be tuned by various

From 860e1ee20b74a6f56e3f435f588d9fc20bee079d Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:22:59 -0400
Subject: [PATCH 10/12] changed path_to_hta to PATH_TO_TRACES (the folder
 contains traces, not analysis)

---
 examples/identify_stragglers.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/identify_stragglers.ipynb b/examples/identify_stragglers.ipynb
index 4bcb8c0d..30a0eb95 100644
--- a/examples/identify_stragglers.ipynb
+++ b/examples/identify_stragglers.ipynb
@@ -86,7 +86,7 @@
    "outputs": [],
    "source": [
     "# Set path to HolisticTraceAnalysis folder\n",
-    "path_to_hta = \"/path/to/HolisticTraceAnalysis\""
+    "PATH_TO_TRACES = \"traces/\""
    ]
   },
   {
@@ -106,7 +106,7 @@
     "%%time\n",
     "from hta.trace_analysis import TraceAnalysis\n",
     "\n",
-    "trace_dir = path_to_hta + \"/tests/data/vision_transformer\"\n",
+    "trace_dir = PATH_TO_TRACES\n",
     "print(trace_dir)\n",
-    "analyzer = TraceAnalysis(trace_dir = trace_path)"
+    "analyzer = TraceAnalysis(trace_dir = trace_dir)"
    ]

From 45881728347b5acf559b27d83a011ee738d206be Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:23:33 -0400
Subject: [PATCH 11/12] formatted example to wrap

---
 examples/kernel_breakdown_demo.ipynb | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/kernel_breakdown_demo.ipynb b/examples/kernel_breakdown_demo.ipynb
index 78c7701a..4aedbcf1 100644
--- a/examples/kernel_breakdown_demo.ipynb
+++ b/examples/kernel_breakdown_demo.ipynb
@@ -245,10 +245,9 @@
     }
    ],
    "source": [
-    "kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown( \n",
-    "                                             num_kernels=5, \n",
-    "                                             include_memory_kernels=True, \n",
-    "                                             image_renderer=\"png\")"
+    "kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown(\n",
+    "    num_kernels=5, include_memory_kernels=True, image_renderer=\"png\"\n",
+    ")"
    ]
   },
   {

From aa25d7696cceb0ad4328936d7fd2892e8ebd7878 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Thu, 28 Mar 2024 18:36:56 -0400
Subject: [PATCH 12/12] updated num_epochs

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 31105890..ec07989e 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,7 @@ with profile(
   with_stack = True
 ) as prof:
 
-   for _ in range(EPOCHS):
+   for _ in range(NUM_EPOCHS):
       for step, batch_data in enumerate(data_loader):
          train(batch_data)
          prof.step()