 # Import attention utils
 import transformer_engine.pytorch.dot_product_attention.utils as dpa_utils
-import transformer_engine.pytorch.dot_product_attention.inference as dpa_infer
+from transformer_engine.pytorch.dot_product_attention.inference import InferenceParams
 from transformer_engine.pytorch.dot_product_attention.utils import FlashAttentionUtils as fa_utils
 from transformer_engine.pytorch.dot_product_attention.utils import AttentionLogging as attn_log
 from transformer_engine.pytorch.dot_product_attention.rope import apply_rotary_pos_emb
@@ -5384,7 +5384,7 @@ def forward(
         core_attention_bias: Optional[torch.Tensor] = None,
         alibi_slopes: Optional[torch.Tensor] = None,
         fast_zero_fill: bool = True,
-        inference_params: Optional[dpa_infer.InferenceParams] = None,
+        inference_params: Optional[InferenceParams] = None,
         pad_between_seqs: Optional[bool] = None,
     ) -> torch.Tensor:
         """
@@ -5545,7 +5545,7 @@ def forward(
             to the attention score of query i and key j.
         fast_zero_fill: bool, default = `True`
             Whether to use the fast path to set output tensors to 0 or not.
-        inference_params: Optional[dpa_infer.InferenceParams], default = `None`
+        inference_params: Optional[InferenceParams], default = `None`
             Optimizes execution performance during inference by caching Keys and Values of the
             current decoding iteration. These cached values are appended to the K and V values
             computed in previous iterations, eliminating the need to recalculate them for the
@@ -6501,7 +6501,7 @@ def forward(
         window_size: Optional[Tuple[int, int]] = None,
         is_first_microbatch: Optional[bool] = None,
         checkpoint_core_attention: bool = False,
-        inference_params: Optional[dpa_infer.InferenceParams] = None,
+        inference_params: Optional[InferenceParams] = None,
         rotary_pos_emb: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
         core_attention_bias_type: str = "no_bias",
         core_attention_bias: Optional[torch.Tensor] = None,