|
23 | 23 |
|
24 | 24 | ## To evaluate a model in cache-aware streaming mode on a single audio file: |
25 | 25 |
|
26 | | -python speech_to_text_streaming_infer.py \ |
| 26 | +python speech_to_text_cache_aware_streaming_infer.py \ |
27 | 27 | model_path=asr_model.nemo \ |
28 | 28 | audio_file=audio_file.wav \ |
29 | 29 | compare_vs_offline=true \ |
|
32 | 32 |
|
33 | 33 | ## To evaluate a model in cache-aware streaming mode on a manifest file: |
34 | 34 |
|
35 | | -python speech_to_text_streaming_infer.py \ |
| 35 | +python speech_to_text_cache_aware_streaming_infer.py \ |
36 | 36 | model_path=asr_model.nemo \ |
37 | 37 | dataset_manifest=manifest_file.json \ |
38 | 38 | batch_size=16 \ |
39 | 39 | compare_vs_offline=true \ |
40 | 40 | amp=true \ |
41 | 41 | debug_mode=true |
42 | 42 |
|
| 43 | +## It is also possible to use phrase boosting or an external LM with cache-aware models: |
| 44 | +
|
| 45 | +python speech_to_text_cache_aware_streaming_infer.py \ |
| 46 | + model_path=asr_model.nemo \ |
| 47 | + dataset_manifest=manifest_file.json \ |
| 48 | + batch_size=16 \ |
| 49 | + rnnt_decoding.greedy.boosting_tree.key_phrases_file=key_words_list.txt \ |
| 50 | + rnnt_decoding.greedy.boosting_tree_alpha=1.0 \ |
| 51 | + rnnt_decoding.greedy.ngram_lm_model=lm_model.nemo \ |
| 52 | +    rnnt_decoding.greedy.ngram_lm_alpha=0.5 \ |
| 53 | + compare_vs_offline=true \ |
| 54 | + amp=true \ |
| 55 | + debug_mode=true |
| 56 | +
|
43 | 57 | You may drop 'debug_mode' and 'compare_vs_offline' to speed up the streaming evaluation. |
44 | 58 | If compare_vs_offline is not used, then a significantly larger batch_size can be used. |
45 | 59 | Setting `pad_and_drop_preencoded` would perform the caching for all steps including the first step. |
46 | 60 | It may result in slightly different outputs from the sub-sampling module compared to offline mode for some techniques like striding and sw_striding. |
47 | 61 | Enabling it would make it easier to export the model to ONNX. |
48 | 62 |
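As an illustration, a minimal sketch of a faster evaluation run that drops the debug options, enables `pad_and_drop_preencoded`, and uses a larger batch size (the model, manifest, and batch size values are placeholders reused from the examples above):

python speech_to_text_cache_aware_streaming_infer.py \
    model_path=asr_model.nemo \
    dataset_manifest=manifest_file.json \
    batch_size=64 \
    pad_and_drop_preencoded=true \
    amp=true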
|
| 63 | +For customization details (phrase boosting list, n-gram LM), see the documentation: |
| 64 | +https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/asr_language_modeling_and_customization.html |
| 65 | +
|
49 | 66 | ## Hybrid ASR models |
50 | 67 | For Hybrid ASR models, which have two decoders, you may select the decoder by setting decoder_type=DECODER_TYPE, where DECODER_TYPE can be "ctc" or "rnnt". |
51 | 68 | If decoder_type is not set, then the default decoder is used, which is the RNNT decoder for Hybrid ASR models. |
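For example, a hedged sketch of running the CTC decoder of a hybrid model (the checkpoint name asr_hybrid_model.nemo is a placeholder; decoder_type is the option described above):

python speech_to_text_cache_aware_streaming_infer.py \
    model_path=asr_hybrid_model.nemo \
    dataset_manifest=manifest_file.json \
    batch_size=16 \
    decoder_type=ctc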
|
66 | 83 | The following command would simulate cache-aware streaming on a pretrained model from NGC with chunk_size of 100, shift_size of 50 and 2 left chunks as left context. |
67 | 84 | The chunk_size of 100 would be 100*4*10=4000ms for a model with 4x downsampling and 10ms shift in feature extraction. |
68 | 85 |
|
69 | | -python speech_to_text_streaming_infer.py \ |
| 86 | +python speech_to_text_cache_aware_streaming_infer.py \ |
70 | 87 | pretrained_name=stt_en_conformer_ctc_large \ |
71 | 88 | chunk_size=100 \ |
72 | 89 | shift_size=50 \ |
@@ -147,8 +164,9 @@ class TranscriptionConfig: |
147 | 164 | allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) |
148 | 165 | amp: bool = False |
149 | 166 | amp_dtype: str = "float16" # can be set to "float16" or "bfloat16" when using amp |
| 167 | +    # NB: default compute_dtype is float32, since cache-aware models currently do not work with other dtypes |
150 | 168 | compute_dtype: Optional[str] = ( |
151 | | - None # "float32", "bfloat16" or "float16"; if None (default): bfloat16 if available else float32 |
| 169 | + "float32" # "float32" (default), "bfloat16" or "float16"; if None: bfloat16 if available else float32 |
152 | 170 | ) |
153 | 171 | matmul_precision: str = "high" # Literal["highest", "high", "medium"] |
154 | 172 |
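As a usage sketch of the precision-related options above (the values are illustrative, not recommendations; compute_dtype is kept at float32 since cache-aware models currently require it):

python speech_to_text_cache_aware_streaming_infer.py \
    model_path=asr_model.nemo \
    dataset_manifest=manifest_file.json \
    amp=true \
    amp_dtype=bfloat16 \
    compute_dtype=float32 \
    matmul_precision=high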
|
@@ -306,9 +324,9 @@ def main(cfg: TranscriptionConfig): |
306 | 324 | if compute_dtype != torch.float32: |
307 | 325 | # NB: cache-aware models do not currently work with compute_dtype != float32 |
308 | 326 | # since in some layers output is force-casted to float32 |
309 | | - # TODO(vbataev): implement support in future |
| 327 | +    # TODO(vbataev): implement support in the future; then set the default `compute_dtype` in the config back to None |
310 | 328 | raise NotImplementedError( |
311 | | - f"Compute dtype {cfg.compute_dtype} is not yet supported for cache-aware models, use float32 instead" |
| 329 | + f"Compute dtype {compute_dtype} is not yet supported for cache-aware models, use float32 instead" |
312 | 330 | ) |
313 | 331 |
|
314 | 332 | if (cfg.audio_file is None and cfg.dataset_manifest is None) or ( |
|