-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Unified inference of streaming ASR #14817
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 74 commits
3c8391d
1b1cb59
25c57a8
cb2c7f3
ab47f96
c59bc2b
7c30d09
76ff0c4
b3aeb99
cd4eb39
3a5bf54
397a950
08f5203
2f0717d
f00102c
77d5ffb
6e83d6b
2cbdf2d
c19bf72
4800a7c
194a507
5c6e97f
9ee2364
8f65ebf
af6e1ef
e401e6f
18a3e3b
2da1769
13ff6ec
da48a7a
68502ee
55b020e
6f3fed1
42f738f
010213c
a6b9c19
be51fc7
6360dd0
72e3115
1ccd6e5
1f2d381
e68107e
a85520a
efd06b2
fa57b30
022f4eb
f3e0099
d780841
d7d7b74
929a9ab
2ba57fa
971d2b5
088a7c3
2678ad9
4570eb1
c9aaff0
7309846
ade13e5
baae7eb
b3cf0c0
637094e
b88d22c
bbe9020
1da433a
75926e1
80f780b
e2f229a
e2997b0
9a4dbaa
b54d7c2
557b66b
14d671c
e776d55
80b43fe
ca4bae8
565ba2d
6b59370
0348ea3
5e86413
04f49da
0ba2e71
07999dd
51bcec0
4682688
a946876
f236413
e34ed1e
d5537b3
5a7f6b3
50b8de2
60a6d78
f56ad1a
ef02c1a
e366e63
b144d04
49ee2f1
5bce5c1
793c90d
42d9aec
1ec2c4b
68d549b
3a17281
5b63a25
c45b6aa
c8e5b35
4a3585a
7dd54fa
5a7a1ce
cb5bf78
02d5b48
9044aa5
65f2007
66573f0
444733f
bc9042f
525b167
a2cf86d
a54196d
b9eb2ec
d0510c4
e0a0de3
3eba8d8
b1369c2
511fd38
5299973
d55ad43
fdc168c
0328dbf
5cb1a65
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
"""
This script serves as the entry point for local ASR inference, supporting buffered CTC/RNNT/TDT and cache-aware CTC/RNNT inference.

The script performs the following steps:
(1) Accepts as input a single audio file, a directory of audio files, or a manifest file.
    - Note: Input audio files must be 16 kHz, mono-channel WAV files.
(2) Creates a recognizer object to run the ASR pipeline.
(3) Runs inference on the input audio files.
(4) Writes the transcriptions to an output json/jsonl file. Word-level output is written to a separate CTM file.

Example usage:
    python asr_streaming_infer.py \
        --config-path=./conf \
        --config-name=config.yaml \
        audio_file=<path to audio file, directory of audio files, or manifest file> \
        output_filename=<path to output JSON file> \
        lang=en \
        automatic_punctuation=False \
        verbatim_transcripts=True \
        asr_output_granularity=segment \
        ...
    # See conf/*.yaml for all available options

Note:
    The output file is a json file with the following structure:
    {"audio_filepath": "path/to/audio/file", "text": "transcription of the audio file", "json_filepath": "path/to/json/file"}
"""
|
|
||
|
|
||
| from time import time | ||
|
|
||
| import hydra | ||
|
|
||
|
|
||
| from nemo.collections.asr.inference.factory.recognizer_builder import RecognizerBuilder | ||
| from nemo.collections.asr.inference.utils.manifest_io import calculate_duration, dump_output, get_audio_filepaths | ||
| from nemo.collections.asr.inference.utils.progressbar import TQDMProgressBar | ||
| from nemo.utils import logging | ||
|
|
||
# disable nemo_text_processing logging
# Stop its log records propagating to the root logger so they do not
# duplicate/spam the inference output.
try:
    from nemo_text_processing.utils import logger as nemo_text_logger

    nemo_text_logger.propagate = False
except ImportError:
    # NB: nemo_text_processing requires pynini, which is tricky to install on MacOS
    # since nemo_text_processing is not necessary for ASR, wrap the import
    logging.warning("NeMo text processing library is unavailable.")
|
|
||
|
|
||
@hydra.main(version_base=None)
def main(cfg):
    """Run streaming ASR inference over the configured audio inputs.

    Steps: set logging verbosity, collect audio file paths, build the
    recognizer pipeline, transcribe everything while timing the run,
    report RTFX, and dump the transcriptions to disk.
    """
    # Set the logging level
    logging.setLevel(cfg.log_level)

    # Resolve the input (single file, directory, or manifest) into a
    # duration-sorted list of audio file paths.
    filepaths = get_audio_filepaths(cfg.audio_file, sort_by_duration=True)
    logging.info(f"Found {len(filepaths)} audio files")

    # Assemble the ASR pipeline and a progress reporter.
    recognizer = RecognizerBuilder.build_recognizer(cfg)
    progress = TQDMProgressBar()

    # Transcribe all inputs, timing the wall-clock duration of the run.
    started = time()
    transcriptions = recognizer.run(filepaths, progress_bar=progress)
    elapsed = time() - started

    # RTFX = seconds of audio processed per second of wall-clock time
    # (guard against a zero elapsed time on trivially small inputs).
    total_audio = calculate_duration(filepaths)
    rtfx = total_audio / elapsed if elapsed > 0 else float('inf')
    logging.info(f"RTFX: {rtfx:.2f} ({total_audio:.2f}s / {elapsed:.2f}s)")

    # Persist transcriptions (and any time-aligned output) to disk.
    dump_output(filepaths, transcriptions, cfg.output_filename, cfg.output_dir)
    logging.info(f"Transcriptions written to {cfg.output_filename}")
    logging.info("Done!")
|
|
||
|
|
||
# Hydra parses CLI overrides from sys.argv and injects the composed config.
if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
# ================================
# Cache-Aware ASR Configuration
# ================================

model_name: stt_en_fastconformer_hybrid_large_streaming_multi # Pre-trained streaming model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
# ================================
# CTC ASR Configuration
# ================================

model_name: nvidia/parakeet-ctc-1.1b # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
# ================================
# RNNT ASR Configuration
# ================================

model_name: nvidia/parakeet-rnnt-1.1b # Pre-trained RNNT/hybrid model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision

# ========================
# Language Model Settings
# ========================
ngram_lm_model: "" # Path to ngram language model (empty string disables LM fusion)
ngram_lm_alpha: 0.0 # Alpha (weight) for language model scores
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
# ================================
# Default configurations
# ================================
defaults:
  - _self_
  - asr: ctc_asr_config # ASR configuration
  - pnc: punctuation_capitalization_config # Punctuation & capitalization model config
  - itn: inverse_normalization_config # Inverse text normalization config

# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis
    alpha: 0.5
    entropy_norm: exp

# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU

# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries
  request_type: feature_buffer # Type of request: frame or feature_buffer
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length

# ============================
# Text postprocessing settings
# ============================
text_postprocessor:
  force_to_use_pnc_model: false # Force use of BERT based PnC restoration model
  pnc:
    left_padding_search_size: 45 # Look-back window (#words) for punctuation context
    batch_size: 128 # Batch size for PnC model inference
    max_seq_length: 64 # Max sequence length processed at once
    step: 8 # Sliding step size
    margin: 16 # Overlap between windows to ensure smooth transitions
  itn:
    left_padding_size: 4 # Padding size (#spans) for ITN context
    batch_size: 32 # Batch size for ITN inference
    n_jobs: 16 # Number of parallel jobs for ITN processing

# ========================
# Recognizer settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
recognizer_type: buffered # Recognizer type: buffered, cache_aware
asr_decoding_type: ctc # Decoding method: ctc or rnnt

# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
automatic_punctuation: false # Whether to apply punctuation & capitalization
verbatim_transcripts: true # Whether to apply inverse text normalization
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
# ================================
# Default configurations
# ================================
defaults:
  - _self_
  - asr: rnnt_asr_config # ASR configuration
  - pnc: punctuation_capitalization_config # Punctuation & capitalization model config
  - itn: inverse_normalization_config # Inverse text normalization config

# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis
    alpha: 0.5
    entropy_norm: exp

# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU

# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries
  request_type: feature_buffer # Type of request: frame or feature_buffer
  stateful: true # Whether to use stateful processing (RNNT decoder carries state across chunks)
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length

# ============================
# Text postprocessing settings
# ============================
text_postprocessor:
  force_to_use_pnc_model: false # Force use of BERT based PnC restoration model
  pnc:
    left_padding_search_size: 45 # Look-back window (#words) for punctuation context
    batch_size: 128 # Batch size for PnC model inference
    max_seq_length: 64 # Max sequence length processed at once
    step: 8 # Sliding step size
    margin: 16 # Overlap between windows to ensure smooth transitions
  itn:
    left_padding_size: 4 # Padding size (#spans) for ITN context
    batch_size: 32 # Batch size for ITN inference
    n_jobs: 16 # Number of parallel jobs for ITN processing

# ========================
# Recognizer settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
recognizer_type: buffered # Recognizer type: buffered, cache_aware
asr_decoding_type: rnnt # Decoding method: ctc or rnnt

# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
automatic_punctuation: false # Whether to apply punctuation & capitalization
verbatim_transcripts: true # Whether to apply inverse text normalization
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
Uh oh!
There was an error while loading. Please reload this page.