NVIDIA-NeMo · chtruong814 · Nov 1, 2025 · Sep 15, 2025 · Sep 15, 2025 · Sep 15, 2025
@@ -129,6 +129,8 @@ jobs:
             script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
+          - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Streaming_Inference
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
           - runner: self-hosted-azure

diff --git a/examples/asr/asr_chunked_inference/README.md b/examples/asr/asr_chunked_inference/README.md
@@ -13,3 +13,4 @@ On the other hand, if you increase your chunk size, then the delay between spoke
 ## Chunked Inference
 
 For MultitaskAED models, we provide a script to perform chunked inference. This script will split the input audio into non-overlapping chunks and perform inference on each chunk. The script will then concatenate the results to provide the final transcript.
+
diff --git a/examples/asr/asr_streaming_inference/README.md b/examples/asr/asr_streaming_inference/README.md
@@ -0,0 +1,11 @@
+# Universal Streaming Inference
+
+The `asr_streaming_infer.py` script enables streaming inference for both buffered (CTC/RNNT/TDT) and cache-aware (CTC/RNNT) ASR models. It supports processing a single audio file, a directory of audio files, or a manifest file.
+
+Beyond streaming ASR, the script also supports:
+
+* **Inverse Text Normalization (ITN)**
+* **End-of-Utterance (EoU) Detection**
+* **Word-level and Segment-level Output**
+
+All related configurations can be found in the `../conf/asr_streaming_inference/` directory.
diff --git a/examples/asr/asr_streaming_inference/asr_streaming_infer.py b/examples/asr/asr_streaming_inference/asr_streaming_infer.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script serves as the entry point for local ASR inference, supporting buffered CTC/RNNT/TDT and cache-aware CTC/RNNT inference.
+
+The script performs the following steps:
+    (1) Accepts as input a single audio file, a directory of audio files, or a manifest file.
+        - Note: Input audio files must be 16 kHz, mono-channel WAV files.
+    (2) Creates a pipeline object to perform inference.
+    (3) Runs inference on the input audio files.
+    (4) Writes the transcriptions to an output json/jsonl file. Word/Segment level output is written to a separate JSON file.
+
+Example usage:
+python asr_streaming_infer.py \
+        --config-path=../conf/asr_streaming_inference/ \
+        --config-name=config.yaml \
+        audio_file=<path to audio file, directory of audio files, or manifest file> \
+        output_filename=<path to output JSON file> \
+        lang=en \
+        enable_pnc=False \
+        enable_itn=True \
+        asr_output_granularity=segment \
+        ...
+        # See ../conf/asr_streaming_inference/*.yaml for all available options
+
+Note:
+    The output file is a JSON file with the following structure:
+    {"audio_filepath": "path/to/audio/file", "text": "transcription of the audio file", "json_filepath": "path/to/json/file"}
+"""
+
+
+from time import time
+
+import hydra
+
+
+from nemo.collections.asr.inference.factory.pipeline_builder import PipelineBuilder
+from nemo.collections.asr.inference.utils.manifest_io import calculate_duration, dump_output, get_audio_filepaths
+from nemo.collections.asr.inference.utils.progressbar import TQDMProgressBar
+from nemo.utils import logging
+
+# disable nemo_text_processing logging
+try:
+    from nemo_text_processing.utils import logger as nemo_text_logger
+
+    nemo_text_logger.propagate = False
+except ImportError:
+    # NB: nemo_text_processing requires pynini, which is tricky to install on macOS.
+    # Since nemo_text_processing is not necessary for ASR, the import is wrapped in try/except.
+    logging.warning("NeMo text processing library is unavailable.")
+
+
+@hydra.main(version_base=None)
+def main(cfg):
+    """Run the ASR inference pipeline built from `cfg` and dump the transcriptions to an output file."""
+    # Set the logging level
+    logging.setLevel(cfg.log_level)
+
+    # Collect the input audio filepaths (requested sorted by duration)
+    audio_filepaths = get_audio_filepaths(cfg.audio_file, sort_by_duration=True)
+    logging.info(f"Found {len(audio_filepaths)} audio files")
+
+    # Build the pipeline
+    pipeline = PipelineBuilder.build_pipeline(cfg)
+    progress_bar = TQDMProgressBar()
+
+    # Run the pipeline and measure the wall-clock execution time
+    start = time()
+    output = pipeline.run(audio_filepaths, progress_bar=progress_bar)
+    exec_dur = time() - start
+
+    # Calculate RTFX as audio duration / execution time (inf if no time elapsed)
+    data_dur = calculate_duration(audio_filepaths)
+    rtfx = data_dur / exec_dur if exec_dur > 0 else float('inf')
+    logging.info(f"RTFX: {rtfx:.2f} ({data_dur:.2f}s / {exec_dur:.2f}s)")
+
+    # Dump the transcriptions to an output file
+    dump_output(output, cfg.output_filename, cfg.output_dir)
+    logging.info(f"Transcriptions written to {cfg.output_filename}")
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/asr/conf/asr_streaming_inference/buffered_ctc.yaml b/examples/asr/conf/asr_streaming_inference/buffered_ctc.yaml
@@ -0,0 +1,80 @@
+# ================================
+# ASR Configuration
+# ================================
+asr:
+  model_name: nvidia/parakeet-ctc-1.1b         # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path
+  device: cuda                                 # Device for inference: 'cuda' or 'cpu'
+  device_id: 0                                 # GPU device ID
+  compute_dtype: bfloat16                      # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32' 
+  use_amp: false                               # Enable Automatic Mixed Precision
+
+
+# ==========================================
+# Inverse Text Normalization Configuration
+# ==========================================
+itn:
+  input_case: lower_cased                       # Input text case handling: 'lower_cased', 'cased'
+  whitelist: null                               # Custom whitelist for ITN processing
+  overwrite_cache: false                        # Whether to overwrite existing cache files
+  max_number_of_permutations_per_split: 729     # Maximum permutations allowed per text split during ITN processing
+  left_padding_size: 4                          # Padding size (#spans) for ITN context
+  batch_size: 32                                # Batch size for ITN inference
+  n_jobs: 16                                    # Number of parallel jobs for ITN processing
+
+
+# ========================
+# Confidence estimation
+# ========================
+confidence:
+  exclude_blank: true                         # Exclude blank tokens when calculating confidence
+  aggregation: mean                           # Aggregation method for confidence across time steps
+  method_cfg:
+    name: entropy                             # Confidence estimation method: 'max_prob' or 'entropy'
+    entropy_type: tsallis                     
+    alpha: 0.5                                
+    entropy_norm: exp                         
+
+
+# ========================
+# Endpointing settings
+# ========================
+endpointing:
+  stop_history_eou: 800                       # Time window (ms) for evaluating EoU
+  residue_tokens_at_end: 2                    # Number of residual tokens used for EoU
+
+
+# ========================
+# Streaming configuration
+# ========================
+streaming:
+  sample_rate: 16000                          # Audio sample rate in Hz
+  batch_size: 256                             # Number of audio frames per batch
+  left_padding_size: 1.6                      # Left padding duration in seconds
+  right_padding_size: 1.6                     # Right padding duration in seconds
+  chunk_size: 4.8                             # Audio chunk size in seconds
+  word_boundary_tolerance: 4                  # Tolerance for word boundaries
+  request_type: feature_buffer                # Type of request: frame or feature_buffer
+  padding_mode: right                         # Padding mode: left or right. How to pad frames to match the required buffer length
+
+
+# ========================
+# Pipeline settings
+# ========================
+matmul_precision: high                        # Matrix multiplication precision: highest, high, medium
+log_level: 20                                 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
+pipeline_type: buffered                       # Pipeline type: buffered, cache_aware
+asr_decoding_type: ctc                        # Decoding method: ctc or rnnt
+
+
+# ========================
+# Runtime arguments defined at runtime via command line
+# ========================
+audio_file: null                              # Path to audio file, directory, or manifest JSON
+output_filename: null                         # Path to output transcription JSON file
+output_dir: null                              # Directory to save time-aligned output
+enable_pnc: false                             # Whether to apply punctuation & capitalization
+enable_itn: false                             # Whether to apply inverse text normalization
+asr_output_granularity: segment               # Output granularity: word or segment
+cache_dir: null                               # Directory to store cache (e.g., .far files)
+lang: null                                    # Language code for ASR model
+return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
diff --git a/examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml b/examples/asr/conf/asr_streaming_inference/buffered_rnnt.yaml
@@ -0,0 +1,83 @@
+# ================================
+# ASR Configuration
+# ================================
+asr:
+  model_name: nvidia/parakeet-rnnt-1.1b        # Pre-trained RNNT/hybrid model from NGC/HuggingFace or local .nemo file path
+  device: cuda                                 # Device for inference: 'cuda' or 'cpu'
+  device_id: 0                                 # GPU device ID
+  compute_dtype: bfloat16                      # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'
+  use_amp: false                               # Enable Automatic Mixed Precision
+  ngram_lm_model: ""                           # Path to ngram language model
+  ngram_lm_alpha: 0.0                          # Alpha for language model
+
+
+# ==========================================
+# Inverse Text Normalization Configuration
+# ==========================================
+itn:
+  input_case: lower_cased                       # Input text case handling: 'lower_cased', 'cased'
+  whitelist: null                               # Custom whitelist for ITN processing
+  overwrite_cache: false                        # Whether to overwrite existing cache files
+  max_number_of_permutations_per_split: 729     # Maximum permutations allowed per text split during ITN processing
+  left_padding_size: 4                          # Padding size (#spans) for ITN context
+  batch_size: 32                                # Batch size for ITN inference
+  n_jobs: 16                                    # Number of parallel jobs for ITN processing
+
+
+# ========================
+# Confidence estimation
+# ========================
+confidence:
+  exclude_blank: true                         # Exclude blank tokens when calculating confidence
+  aggregation: mean                           # Aggregation method for confidence across time steps
+  method_cfg:
+    name: entropy                             # Confidence estimation method: 'max_prob' or 'entropy'
+    entropy_type: tsallis                     
+    alpha: 0.5                                
+    entropy_norm: exp 
+
+
+# ========================
+# Endpointing settings
+# ========================
+endpointing:
+  stop_history_eou: 800                       # Time window (ms) for evaluating EoU
+  residue_tokens_at_end: 2                    # Number of residual tokens used for EoU
+
+
+# ========================
+# Streaming configuration
+# ========================
+streaming:
+  sample_rate: 16000                          # Audio sample rate in Hz
+  batch_size: 256                             # Number of audio frames per batch
+  left_padding_size: 1.6                      # Left padding duration in seconds
+  right_padding_size: 1.6                     # Right padding duration in seconds
+  chunk_size: 4.8                             # Audio chunk size in seconds
+  word_boundary_tolerance: 4                  # Tolerance for word boundaries
+  request_type: feature_buffer                # Type of request: frame or feature_buffer
+  stateful: true                              # Whether to use stateful processing
+  padding_mode: right                         # Padding mode: left or right. How to pad frames to match the required buffer length
+
+
+# ========================
+# Pipeline settings
+# ========================
+matmul_precision: high                     # Matrix multiplication precision: highest, high, medium
+log_level: 20                              # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
+pipeline_type: buffered                    # Pipeline type: buffered, cache_aware
+asr_decoding_type: rnnt                    # Decoding method: ctc or rnnt
+
+
+# ========================
+# Runtime arguments defined at runtime via command line
+# ========================
+audio_file: null                              # Path to audio file, directory, or manifest JSON
+output_filename: null                         # Path to output transcription JSON file
+output_dir: null                              # Directory to save time-aligned output
+enable_pnc: false                             # Whether to apply punctuation & capitalization
+enable_itn: false                             # Whether to apply inverse text normalization
+asr_output_granularity: segment               # Output granularity: word or segment
+cache_dir: null                               # Directory to store cache (e.g., .far files)
+lang: null                                    # Language code for ASR model
+return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
diff --git a/examples/asr/conf/asr_streaming_inference/cache_aware_ctc.yaml b/examples/asr/conf/asr_streaming_inference/cache_aware_ctc.yaml
@@ -0,0 +1,80 @@
+# ================================
+# ASR Configuration
+# ================================
+asr:
+  model_name: stt_en_fastconformer_hybrid_large_streaming_multi         # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path
+  device: cuda                                                          # Device for inference: 'cuda' or 'cpu'
+  device_id: 0                                                          # GPU device ID
+  compute_dtype: bfloat16                                               # Compute precision: 'bfloat16' for Ampere+, 'float16' for older GPUs, or 'float32'
+  use_amp: true                                                         # Enable Automatic Mixed Precision
+
+
+# ==========================================
+# Inverse Text Normalization Configuration
+# ==========================================
+itn:
+  input_case: lower_cased                       # Input text case handling: 'lower_cased', 'cased'
+  whitelist: null                               # Custom whitelist for ITN processing
+  overwrite_cache: false                        # Whether to overwrite existing cache files
+  max_number_of_permutations_per_split: 729     # Maximum permutations allowed per text split during ITN processing
+  left_padding_size: 4                          # Padding size (#spans) for ITN context
+  batch_size: 32                                # Batch size for ITN inference
+  n_jobs: 16                                    # Number of parallel jobs for ITN processing
+
+
+# ========================
+# Confidence estimation
+# ========================
+confidence:
+  exclude_blank: true                         # Exclude blank tokens when calculating confidence
+  aggregation: mean                           # Aggregation method for confidence across time steps
+  method_cfg:
+    name: entropy                             # Confidence estimation method: 'max_prob' or 'entropy'
+    entropy_type: tsallis                     
+    alpha: 0.5                                
+    entropy_norm: exp        
+
+
+# ========================
+# Endpointing settings
+# ========================
+endpointing:
+  stop_history_eou: 800                       # Time window (ms) for evaluating EoU
+  residue_tokens_at_end: 2                    # Number of residual tokens used for EoU
+
+
+# ========================
+# Streaming configuration
+# ========================
+streaming:
+  sample_rate: 16000                          # Audio sample rate in Hz
+  batch_size: 256                             # Number of audio frames per batch
+  word_boundary_tolerance: 4                  # Tolerance for word boundaries
+  att_context_size: [70,13]                   # Attention context size: [70,13],[70,6],[70,1],[70,0]
+  use_cache: true                             # Whether to use cache for streaming
+  use_feat_cache: true                        # Whether to cache mel-spec features, set false to re-calculate all mel-spec features in audio buffer
+  chunk_size_in_secs: null                    # Amount of audio to load for each streaming step, e.g., 0.08s for FastConformer. Set to `null` to use the default size of 1 + lookahead frames.
+  request_type: frame                         # Type of request: only 'frame' is supported for cache-aware streaming
+  num_slots: 1024                             # Number of slots in the context manager: must be >= batch_size
+
+
+# ========================
+# Pipeline settings
+# ========================
+matmul_precision: high                        # Matrix multiplication precision: highest, high, medium
+log_level: 20                                 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
+pipeline_type: cache_aware                    # Pipeline type: buffered, cache_aware
+asr_decoding_type: ctc                        # Decoding method: ctc or rnnt
+
+# ========================
+# Runtime arguments defined at runtime via command line
+# ========================
+audio_file: null                              # Path to audio file, directory, or manifest JSON
+output_filename: null                         # Path to output transcription JSON file
+output_dir: null                              # Directory to save time-aligned output
+enable_pnc: false                             # Whether to apply punctuation & capitalization
+enable_itn: false                             # Whether to apply inverse text normalization
+asr_output_granularity: segment               # Output granularity: word or segment
+cache_dir: null                               # Directory to store cache (e.g., .far files)
+lang: null                                    # Language code for ASR model
+return_tail_result: false                     # Whether to return the tail labels left in the right padded side of the buffer
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,3 +13,4 @@ On the other hand, if you increase your chunk size, then the delay between spoke
		## Chunked Inference

		For MultitaskAED models, we provide a script to perform chunked inference. This script will split the input audio into non-overlapping chunks and perform inference on each chunk. The script will then concatenate the results to provide the final transcript.