Updates

nune-tadevosyan · nune-tadevosyan · commit fe13830289d2 · 2026-02-10T17:43:24.000+04:00
Signed-off-by: Nune <ntadevosyan@nvidia.com>
diff --git a/nemo/collections/asr/parts/utils/chunking_utils.py b/nemo/collections/asr/parts/utils/chunking_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from difflib import SequenceMatcher
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import torch
 
@@ -395,10 +395,14 @@ def ensure_char_token(entry):
     # When return_hypotheses is True, y_sequence contains logits (2D: [T, V]).
     if return_hypotheses and hasattr(hypotheses[0], 'token_sequence') and hypotheses[0].token_sequence is not None:
         merged_hypotheses.y_sequence = torch.cat([hyp.y_sequence for hyp in hypotheses], dim=0)
-        merged_hypotheses.token_sequence = torch.tensor(merged_tokens, dtype=torch.long)
+        merged_hypotheses.token_sequence = torch.tensor(merged_tokens)
     else:
         merged_hypotheses.y_sequence = torch.tensor(merged_tokens)
 
+    merged_alignments = join_alignments(hypotheses)
+    if merged_alignments is not None:
+        merged_hypotheses.alignments = merged_alignments
+
     merged_hypotheses = join_confidence_values(merged_hypotheses, hypotheses)
     merged_hypotheses.text = final_text
     # Set score to minimum of all chunk scores, length to sum of all chunk lengths
@@ -504,6 +508,48 @@ def update_timestamps(hypotheses, tokenizer=None, timestamps_type=None, lang_id=
     return hypotheses
 
 
+def join_alignments(
+    hypotheses: List[Hypothesis],
+) -> Optional[Union[torch.Tensor, List]]:
+    """
+    Concatenate alignments from multiple chunk hypotheses into a single sequence.
+
+    Tensor alignments are concatenated along dim 0; list alignments (flat CTC-style lists or
+    RNNT-style lists of per-time-step lists) are extended in chunk order. If any hypothesis
+    has no alignments, returns None and the caller should leave merged alignments unset.
+
+    Args:
+        hypotheses: List of Hypothesis objects, each possibly having an alignments attribute.
+
+    Returns:
+        Concatenated alignments (tensor or list); None if the input is empty or any hypothesis lacks alignments.
+    """
+    if not hypotheses:
+        return None
+    if not all(getattr(h, 'alignments', None) is not None for h in hypotheses):
+        return None
+
+    alignments_list = [h.alignments for h in hypotheses]
+
+    # Tensor alignments (CTC): branch is chosen by the first chunk's type; concatenate along dim 0
+    if isinstance(alignments_list[0], torch.Tensor):
+        return torch.cat(alignments_list, dim=0)
+
+    # RNNT: list of lists (one list per time step T); detected from the first non-empty chunk
+    first_nonempty = next((a for a in alignments_list if len(a) > 0), None)
+    if first_nonempty is not None and isinstance(first_nonempty[0], (list, tuple)):
+        result = []
+        for a in alignments_list:
+            result.extend(a)
+        return result
+
+    # Fallback (CTC flat list of ints); any tensor chunk reaching here is converted with tolist()
+    result = []
+    for a in alignments_list:
+        result.extend(a.tolist() if isinstance(a, torch.Tensor) else a)
+    return result
+
+
 def join_confidence_values(merged_hypothesis, hypotheses):
     """
     Concatenate confidence values from multiple hypotheses into a single sequence.
@@ -1116,6 +1162,11 @@ def merge_hypotheses_of_same_audio(
         if valid_y_sequences:
             merged_hypothesis.y_sequence = torch.cat(valid_y_sequences)
 
+    # Merge alignments from all hypotheses (CTC: 1D tensor/list; RNNT: list of lists)
+    merged_alignments = join_alignments(hypotheses_list)
+    if merged_alignments is not None:
+        merged_hypothesis.alignments = merged_alignments
+
     # Merge confidence values from all hypotheses
     merged_hypothesis = join_confidence_values(merged_hypothesis, hypotheses_list)
     # Set score to minimum of all chunk scores, length to sum of all chunk lengths
diff --git a/tests/collections/asr/mixins/test_transcription.py b/tests/collections/asr/mixins/test_transcription.py
@@ -321,7 +321,7 @@ def test_transcribe_return_hypothesis(self, test_data_dir, fast_conformer_ctc_mo
 
         # Audio file test
         #setting enable_chunking False for alignment check
-        outputs = fast_conformer_ctc_model.transcribe(audio_file, batch_size=1, return_hypotheses=True, enable_chunking=False) 
+        outputs = fast_conformer_ctc_model.transcribe(audio_file, batch_size=1, return_hypotheses=True) 
         assert len(outputs) == 1
         assert isinstance(outputs[0], Hypothesis)
 
diff --git a/tests/collections/asr/test_asr_context_biasing.py b/tests/collections/asr/test_asr_context_biasing.py
@@ -59,7 +59,7 @@ def test_run_word_spotter(self, test_data_dir, conformer_ctc_bpe_model):
         target_text = "nineteen"
         target_tokenization = asr_model.tokenizer.text_to_ids(target_text)
         ctc_logprobs = (
-            asr_model.transcribe([audio_file_path], batch_size=1, return_hypotheses=True, enable_chunking=False)[0].alignments.cpu().numpy()
+            asr_model.transcribe([audio_file_path], batch_size=1, return_hypotheses=True)[0].alignments.cpu().numpy()
         )
         context_biasing_list = [[target_text, [target_tokenization]]]
         context_graph = context_biasing.ContextGraphCTC(blank_id=asr_model.decoding.blank_id)

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ def test_run_word_spotter(self, test_data_dir, conformer_ctc_bpe_model):`
`59`	`59`	`target_text = "nineteen"`
`60`	`60`	`target_tokenization = asr_model.tokenizer.text_to_ids(target_text)`
`61`	`61`	`ctc_logprobs = (`
`62`		`- asr_model.transcribe([audio_file_path], batch_size=1, return_hypotheses=True, enable_chunking=False)[0].alignments.cpu().numpy()`
	`62`	`+ asr_model.transcribe([audio_file_path], batch_size=1, return_hypotheses=True)[0].alignments.cpu().numpy()`
`63`	`63`	`)`
`64`	`64`	`context_biasing_list = [[target_text, [target_tokenization]]]`
`65`	`65`	`context_graph = context_biasing.ContextGraphCTC(blank_id=asr_model.decoding.blank_id)`