-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Unified inference of streaming ASR #14817
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 74 commits
3c8391d
1b1cb59
25c57a8
cb2c7f3
ab47f96
c59bc2b
7c30d09
76ff0c4
b3aeb99
cd4eb39
3a5bf54
397a950
08f5203
2f0717d
f00102c
77d5ffb
6e83d6b
2cbdf2d
c19bf72
4800a7c
194a507
5c6e97f
9ee2364
8f65ebf
af6e1ef
e401e6f
18a3e3b
2da1769
13ff6ec
da48a7a
68502ee
55b020e
6f3fed1
42f738f
010213c
a6b9c19
be51fc7
6360dd0
72e3115
1ccd6e5
1f2d381
e68107e
a85520a
efd06b2
fa57b30
022f4eb
f3e0099
d780841
d7d7b74
929a9ab
2ba57fa
971d2b5
088a7c3
2678ad9
4570eb1
c9aaff0
7309846
ade13e5
baae7eb
b3cf0c0
637094e
b88d22c
bbe9020
1da433a
75926e1
80f780b
e2f229a
e2997b0
9a4dbaa
b54d7c2
557b66b
14d671c
e776d55
80b43fe
ca4bae8
565ba2d
6b59370
0348ea3
5e86413
04f49da
0ba2e71
07999dd
51bcec0
4682688
a946876
f236413
e34ed1e
d5537b3
5a7f6b3
50b8de2
60a6d78
f56ad1a
ef02c1a
e366e63
b144d04
49ee2f1
5bce5c1
793c90d
42d9aec
1ec2c4b
68d549b
3a17281
5b63a25
c45b6aa
c8e5b35
4a3585a
7dd54fa
5a7a1ce
cb5bf78
02d5b48
9044aa5
65f2007
66573f0
444733f
bc9042f
525b167
a2cf86d
a54196d
b9eb2ec
d0510c4
e0a0de3
3eba8d8
b1369c2
511fd38
5299973
d55ad43
fdc168c
0328dbf
5cb1a65
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
"""
This script serves as the entry point for local ASR inference, supporting buffered CTC/RNNT/TDT and cache-aware CTC/RNNT inference.

The script performs the following steps:
(1) Accepts as input a single audio file, a directory of audio files, or a manifest file.
    - Note: Input audio files must be 16 kHz, mono-channel WAV files.
(2) Creates a recognizer object to run the ASR pipeline.
(3) Runs inference on the input audio files.
(4) Writes the transcriptions to an output json/jsonl file. Word-level output is written to a separate CTM file.

Example usage:
    python asr_streaming_infer.py \
        --config-path=./conf \
        --config-name=config.yaml \
        audio_file=<path to audio file, directory of audio files, or manifest file> \
        output_filename=<path to output JSON file> \
        lang=en \
        automatic_punctuation=False \
        verbatim_transcripts=True \
        asr_output_granularity=segment \
        ...
    # See conf/*.yaml for all available options

Note:
    The output file is a json file with the following structure:
    {"audio_filepath": "path/to/audio/file", "text": "transcription of the audio file", "json_filepath": "path/to/json/file"}
"""
|
|
||
|
|
||
| from time import time | ||
|
|
||
| import hydra | ||
|
|
||
|
|
||
| from nemo.collections.asr.inference.factory.recognizer_builder import RecognizerBuilder | ||
| from nemo.collections.asr.inference.utils.manifest_io import calculate_duration, dump_output, get_audio_filepaths | ||
| from nemo.collections.asr.inference.utils.progressbar import TQDMProgressBar | ||
| from nemo.utils import logging | ||
|
|
||
# disable nemo_text_processing logging
# Stop its log records propagating to the root logger so they do not
# duplicate/spam the inference output.
try:
    from nemo_text_processing.utils import logger as nemo_text_logger

    nemo_text_logger.propagate = False
except ImportError:
    # NB: nemo_text_processing requires pynini, which is tricky to install on MacOS
    # since nemo_text_processing is not necessary for ASR, wrap the import
    logging.warning("NeMo text processing library is unavailable.")
|
|
||
|
|
||
@hydra.main(version_base=None)
def main(cfg):
    """Run streaming ASR inference over the configured audio inputs.

    Steps: set logging verbosity, collect audio file paths, build the
    recognizer pipeline, transcribe everything while timing the run,
    report RTFX, and dump the transcriptions to disk.
    """
    # Set the logging level
    logging.setLevel(cfg.log_level)

    # Resolve the input (single file, directory, or manifest) into a
    # duration-sorted list of audio file paths.
    filepaths = get_audio_filepaths(cfg.audio_file, sort_by_duration=True)
    logging.info(f"Found {len(filepaths)} audio files")

    # Assemble the ASR pipeline and a progress reporter.
    recognizer = RecognizerBuilder.build_recognizer(cfg)
    progress = TQDMProgressBar()

    # Transcribe all inputs, timing the wall-clock duration of the run.
    started = time()
    transcriptions = recognizer.run(filepaths, progress_bar=progress)
    elapsed = time() - started

    # RTFX = seconds of audio processed per second of wall-clock time
    # (guard against a zero elapsed time on trivially small inputs).
    total_audio = calculate_duration(filepaths)
    rtfx = total_audio / elapsed if elapsed > 0 else float('inf')
    logging.info(f"RTFX: {rtfx:.2f} ({total_audio:.2f}s / {elapsed:.2f}s)")

    # Persist transcriptions (and any time-aligned output) to disk.
    dump_output(filepaths, transcriptions, cfg.output_filename, cfg.output_dir)
    logging.info(f"Transcriptions written to {cfg.output_filename}")
    logging.info("Done!")
|
|
||
|
|
||
# Hydra parses CLI overrides from sys.argv and injects the composed config.
if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
# ================================
# Cache-Aware ASR Configuration
# ================================

model_name: stt_en_fastconformer_hybrid_large_streaming_multi # Pre-trained streaming model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
# ================================
# CTC ASR Configuration
# ================================

model_name: nvidia/parakeet-ctc-1.1b # Pre-trained CTC/hybrid model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
# ================================
# RNNT ASR Configuration
# ================================

model_name: nvidia/parakeet-rnnt-1.1b # Pre-trained RNNT/hybrid model from NGC/HuggingFace or local .nemo file path

# ========================
# Device Configuration
# ========================
device: cuda # Device for inference: 'cuda' or 'cpu'
device_id: 0 # GPU device ID
compute_dtype: bfloat16 # Compute precision: 'bfloat16' for Ampere+,
                        # 'float16' for older GPUs,
                        # 'float32' as the universal fallback

# ========================
# Mixed Precision Settings
# ========================
use_amp: true # Enable Automatic Mixed Precision

# ========================
# Language Model Settings
# ========================
ngram_lm_model: "" # Path to ngram language model (empty string disables LM fusion)
ngram_lm_alpha: 0.0 # Alpha (weight) for language model scores
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
# ================================
# Default configurations
# ================================
defaults:
  - _self_
  - asr: ctc_asr_config # ASR configuration
  - pnc: punctuation_capitalization_config # Punctuation & capitalization model config
  - itn: inverse_normalization_config # Inverse text normalization config

# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis
    alpha: 0.5
    entropy_norm: exp

# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU

# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries
  request_type: feature_buffer # Type of request: frame or feature_buffer
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length

# ============================
# Text postprocessing settings
# ============================
text_postprocessor:
  force_to_use_pnc_model: false # Force use of BERT based PnC restoration model
  pnc:
    left_padding_search_size: 45 # Look-back window (#words) for punctuation context
    batch_size: 128 # Batch size for PnC model inference
    max_seq_length: 64 # Max sequence length processed at once
    step: 8 # Sliding step size
    margin: 16 # Overlap between windows to ensure smooth transitions
  itn:
    left_padding_size: 4 # Padding size (#spans) for ITN context
    batch_size: 32 # Batch size for ITN inference
    n_jobs: 16 # Number of parallel jobs for ITN processing

# ========================
# Recognizer settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
recognizer_type: buffered # Recognizer type: buffered, cache_aware
asr_decoding_type: ctc # Decoding method: ctc or rnnt

# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
automatic_punctuation: false # Whether to apply punctuation & capitalization
verbatim_transcripts: true # Whether to apply inverse text normalization
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
# ================================
# Default configurations
# ================================
defaults:
  - _self_
  - asr: rnnt_asr_config # ASR configuration
  - pnc: punctuation_capitalization_config # Punctuation & capitalization model config
  - itn: inverse_normalization_config # Inverse text normalization config

# ========================
# Confidence estimation
# ========================
confidence:
  exclude_blank: true # Exclude blank tokens when calculating confidence
  aggregation: mean # Aggregation method for confidence across time steps
  method_cfg:
    name: entropy # Confidence estimation method: 'max_prob' or 'entropy'
    entropy_type: tsallis
    alpha: 0.5
    entropy_norm: exp

# ========================
# Endpointing settings
# ========================
endpointing:
  stop_history_eou: 800 # Time window (ms) for evaluating EoU (end of utterance)
  residue_tokens_at_end: 2 # Number of residual tokens used for EoU

# ========================
# Streaming configuration
# ========================
streaming:
  sample_rate: 16000 # Audio sample rate in Hz
  batch_size: 256 # Number of audio frames per batch
  left_padding_size: 1.6 # Left padding duration in seconds
  right_padding_size: 1.6 # Right padding duration in seconds
  chunk_size: 4.8 # Audio chunk size in seconds
  word_boundary_tolerance: 4 # Tolerance for word boundaries
  request_type: feature_buffer # Type of request: frame or feature_buffer
  stateful: true # Whether to use stateful processing (RNNT decoder carries state across chunks)
  padding_mode: right # Padding mode: left or right. How to pad frames to match the required buffer length

# ============================
# Text postprocessing settings
# ============================
text_postprocessor:
  force_to_use_pnc_model: false # Force use of BERT based PnC restoration model
  pnc:
    left_padding_search_size: 45 # Look-back window (#words) for punctuation context
    batch_size: 128 # Batch size for PnC model inference
    max_seq_length: 64 # Max sequence length processed at once
    step: 8 # Sliding step size
    margin: 16 # Overlap between windows to ensure smooth transitions
  itn:
    left_padding_size: 4 # Padding size (#spans) for ITN context
    batch_size: 32 # Batch size for ITN inference
    n_jobs: 16 # Number of parallel jobs for ITN processing

# ========================
# Recognizer settings
# ========================
matmul_precision: high # Matrix multiplication precision: highest, high, medium
log_level: 20 # Logging level: 0 (NOTSET), 10 (DEBUG), 20 (INFO), 30 (WARNING), 40 (ERROR), 50 (CRITICAL)
recognizer_type: buffered # Recognizer type: buffered, cache_aware
asr_decoding_type: rnnt # Decoding method: ctc or rnnt

# ========================
# Runtime arguments defined at runtime via command line
# ========================
audio_file: null # Path to audio file, directory, or manifest JSON
output_filename: null # Path to output transcription JSON file
output_dir: null # Directory to save time-aligned output
automatic_punctuation: false # Whether to apply punctuation & capitalization
verbatim_transcripts: true # Whether to apply inverse text normalization
asr_output_granularity: segment # Output granularity: word or segment
cache_dir: null # Directory to store cache (e.g., .far files)
lang: null # Language code for ASR model
return_tail_result: false # Whether to return the tail labels left in the right padded side of the buffer
Uh oh!
There was an error while loading. Please reload this page.