From 67e196ed1556cc70605e75943af9d3551e5fb74c Mon Sep 17 00:00:00 2001 From: Miguel del Rio Date: Thu, 10 Apr 2025 13:00:21 -0400 Subject: [PATCH 1/3] Hugginface pipeline run -- missing model files --- examples/huggingface/reverb_config.py | 115 ++++++++++++++++++++ examples/huggingface/reverb_hf.py | 98 +++++++++++++++++ examples/huggingface/reverb_processor.py | 132 +++++++++++++++++++++++ examples/huggingface/transcribe.py | 46 ++++++++ 4 files changed, 391 insertions(+) create mode 100644 examples/huggingface/reverb_config.py create mode 100644 examples/huggingface/reverb_hf.py create mode 100644 examples/huggingface/reverb_processor.py create mode 100644 examples/huggingface/transcribe.py diff --git a/examples/huggingface/reverb_config.py b/examples/huggingface/reverb_config.py new file mode 100644 index 0000000..5f04508 --- /dev/null +++ b/examples/huggingface/reverb_config.py @@ -0,0 +1,115 @@ +# Following https://huggingface.co/docs/transformers/en/custom_models +import math +from typing import Dict, List, Optional +from transformers import PretrainedConfig +import numpy as np + + +def cmvn(means: List[float], variance: List[float], count: int): + """ Calculate cmvn from stats + + Returns: + a numpy array of [means, vars] + """ + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + # cmvn = np.array([means, variance]) + return [means, variance] + + +class ReverbConfig(PretrainedConfig): + # Not sure what to put but also not reqruied: model_type = "encoderdecoder" + model_type = "reverb_asr" + def __init__( + self, + input_dim: int = 80, + output_dim: int = 10001, + cmvn_mean_stat : List[float] = [33596438528.0, 35418329088.0, 39182106624.0, 41983324160.0, 44419112960.0, 46015381504.0, 46934564864.0, 47058870272.0, 47288012800.0, 47522979840.0, 48491438080.0, 49308729344.0, 50230493184.0, 50796900352.0, 51020386304.0, 51297456128.0, 51333586944.0, 51126181888.0, 51455569920.0, 50636410880.0, 49947033600.0, 50365546496.0, 49383075840.0, 49540546560.0, 49066065920.0, 49236889600.0, 48820707328.0, 49071112192.0, 48968024064.0, 49024458752.0, 49202397184.0, 49374433280.0, 49620660224.0, 49947111424.0, 50326310912.0, 50717818880.0, 51046891520.0, 51345678336.0, 51655733248.0, 51505459200.0, 51813666816.0, 51577262080.0, 51776524288.0, 51754237952.0, 51918598144.0, 52158758912.0, 52405276672.0, 52596776960.0, 52639731712.0, 52631220224.0, 52443103232.0, 52315619328.0, 52219695104.0, 52178399232.0, 52083040256.0, 52064792576.0, 51980918784.0, 51824164864.0, 51550973952.0, 51002216448.0, 50422747136.0, 49847754752.0, 49474338816.0, 48997863424.0, 48617009152.0, 48309174272.0, 48084140032.0, 48095608832.0, 47965765632.0, 47909335040.0, 47780065280.0, 47762370560.0, 47757099008.0, 47731314688.0, 47574110208.0, 47336361984.0, 47009054720.0, 46283513856.0, 44821860352.0, 42771775488.0], + cmvn_var_stat: List[float] = [360475131904.0, 401487724544.0, 484368646144.0, 548414357504.0, 608912080896.0, 651613241344.0, 678013698048.0, 683624693760.0, 689524047872.0, 695375822848.0, 722376851456.0, 746773872640.0, 774244204544.0, 791678353408.0, 798920015872.0, 807307444224.0, 808713453568.0, 802957754368.0, 812319899648.0, 788076953600.0, 767619497984.0, 777970712576.0, 748566544384.0, 751065628672.0, 736340869120.0, 739872473088.0, 727466704896.0, 734006083584.0, 731017904128.0, 732582576128.0, 737590444032.0, 742469861376.0, 749455671296.0, 
758746972160.0, 769666121728.0, 781107331072.0, 790730506240.0, 799342002176.0, 808164917248.0, 803454713856.0, 812040585216.0, 804632395776.0, 809866821632.0, 808861499392.0, 813548044288.0, 820701954048.0, 828343779328.0, 834335604736.0, 835754590208.0, 835251011584.0, 829192929280.0, 824705744896.0, 821224734720.0, 819399753728.0, 816182853632.0, 815243788288.0, 812578177024.0, 807846281216.0, 799796035584.0, 784661544960.0, 770915631104.0, 756696285184.0, 746462183424.0, 734193254400.0, 724980072448.0, 717529612288.0, 711156563968.0, 710358204416.0, 706386919424.0, 704228884480.0, 700537110528.0, 699519008768.0, 699025129472.0, 698035535872.0, 693109391360.0, 686047887360.0, 676213948416.0, 655917645824.0, 616676458496.0, 563932168192.0], + cmvn_frame_num: int = 3519342927, + encoder: str = "conformer", + encoder_activation_type: str = "swish", + encoder_attention_dropout_rate: float = 0.1, + encoder_attention_heads: int = 8, + encoder_causal: bool = True, + encoder_cnn_module_kernel: int = 31, + encoder_cnn_module_norm: str = "layer_norm", + encoder_dropout_rate: float = 0.1, + encoder_input_layer: str = "conv2d", + encoder_linear_units: int = 2048, + encoder_normalize_before: bool = True, + encoder_num_blocks: int = 18, + encoder_num_langs: int = 2, + encoder_output_size: int = 640, + encoder_pos_enc_layer_type: str = "rel_pos", + encoder_positional_dropout_rate: float = 0.1, + encoder_selfattention_layer_type: str = "rel_selfattn", + encoder_use_cnn_module: bool = True, + encoder_use_dynamic_chunk: bool = True, + decoder: str = "lslbitransformer", + decoder_attention_heads: int = 8, + decoder_dropout_rate: float = 0.1, + decoder_linear_units: int = 2048, + decoder_num_blocks: int = 6, + decoder_num_langs: int = 2, + decoder_positional_dropout_rate: float = 0.1, + decoder_r_num_blocks: int = 6, + decoder_self_attention_dropout_rate: float = 0.1, + decoder_src_attention_dropout_rate: float = 0.1, + ctc_blank_id: int = 0, + ctc_weight: float = 0.3, + lsm_weight: float = 0.1, + reverse_weight: float = 0.3, + special_tokens: Optional[Dict[str, int]] = None, + **kwargs, + ): + self.input_dim = input_dim + self.output_dim = output_dim + self.encoder = encoder + self.encoder_activation_type = encoder_activation_type + self.encoder_attention_dropout_rate = encoder_attention_dropout_rate + self.encoder_attention_heads = encoder_attention_heads + self.encoder_causal = encoder_causal + self.encoder_cnn_module_kernel = encoder_cnn_module_kernel + self.encoder_cnn_module_norm = encoder_cnn_module_norm + self.encoder_dropout_rate = encoder_dropout_rate + self.encoder_input_layer = encoder_input_layer + self.encoder_linear_units = encoder_linear_units + self.encoder_normalize_before = encoder_normalize_before + self.encoder_num_blocks = encoder_num_blocks + self.encoder_num_langs = encoder_num_langs + self.encoder_output_size = encoder_output_size + self.encoder_pos_enc_layer_type = encoder_pos_enc_layer_type + self.encoder_positional_dropout_rate = encoder_positional_dropout_rate + self.encoder_selfattention_layer_type = encoder_selfattention_layer_type + self.encoder_use_cnn_module = encoder_use_cnn_module + self.encoder_use_dynamic_chunk = encoder_use_dynamic_chunk + self.decoder = decoder + self.decoder_attention_heads = decoder_attention_heads + self.decoder_dropout_rate = decoder_dropout_rate + self.decoder_linear_units = decoder_linear_units + self.decoder_num_blocks = decoder_num_blocks + self.decoder_num_langs = decoder_num_langs + self.decoder_positional_dropout_rate = 
decoder_positional_dropout_rate
+        self.decoder_r_num_blocks = decoder_r_num_blocks
+        self.decoder_self_attention_dropout_rate = decoder_self_attention_dropout_rate
+        self.decoder_src_attention_dropout_rate = decoder_src_attention_dropout_rate
+        self.ctc_blank_id = ctc_blank_id
+        self.ctc_weight = ctc_weight
+        self.lsm_weight = lsm_weight
+        self.reverse_weight = reverse_weight
+        if special_tokens is None:
+            special_tokens = {
+                "<blank>": 0,
+                "<sos>": 2,
+                "<eos>": 2,
+                "<unk>": 1,
+            }
+        self.special_tokens = special_tokens
+        self.cmvn_mean, self.cmvn_istd = cmvn(cmvn_mean_stat, cmvn_var_stat, cmvn_frame_num)
+        self.inputs_to_logits_ratio = 1
+        super().__init__(**kwargs)
diff --git a/examples/huggingface/reverb_hf.py b/examples/huggingface/reverb_hf.py
new file mode 100644
index 0000000..d1908c0
--- /dev/null
+++ b/examples/huggingface/reverb_hf.py
@@ -0,0 +1,98 @@
+# Following https://huggingface.co/docs/transformers/en/custom_models
+
+from typing import List, Optional, Tuple, Union
+import torch
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import Seq2SeqLMOutput
+from wenet.transformer.asr_model import ASRModel
+from wenet.transformer.cmvn import GlobalCMVN
+from wenet.transformer.ctc import CTC
+from wenet.transformer.decoder import LanguageSpecificBiTransformerDecoder
+from wenet.transformer.encoder import ConformerEncoder
+from reverb_config import ReverbConfig
+
+class ReverbModel(PreTrainedModel):
+    config_class = ReverbConfig
+    main_input_name = "input_features"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # Build the wenet components (CMVN, Conformer encoder, LSL bi-transformer
+        # decoder, CTC head) directly from the HF config values.
+        global_cmvn = GlobalCMVN(
+            torch.Tensor(config.cmvn_mean),
+            torch.Tensor(config.cmvn_istd),
+        )
+        encoder = ConformerEncoder(
+            config.input_dim,
+            global_cmvn=global_cmvn,
+            activation_type=config.encoder_activation_type,
+            attention_dropout_rate=config.encoder_attention_dropout_rate,
+            attention_heads=config.encoder_attention_heads,
+            causal=config.encoder_causal,
+            cnn_module_kernel=config.encoder_cnn_module_kernel,
+            cnn_module_norm=config.encoder_cnn_module_norm,
+            dropout_rate=config.encoder_dropout_rate,
+            input_layer=config.encoder_input_layer,
+            linear_units=config.encoder_linear_units,
+            normalize_before=config.encoder_normalize_before,
+            num_blocks=config.encoder_num_blocks,
+            num_langs=config.encoder_num_langs,
+            output_size=config.encoder_output_size,
+            pos_enc_layer_type=config.encoder_pos_enc_layer_type,
+            positional_dropout_rate=config.encoder_positional_dropout_rate,
+            selfattention_layer_type=config.encoder_selfattention_layer_type,
+            use_cnn_module=config.encoder_use_cnn_module,
+            use_dynamic_chunk=config.encoder_use_dynamic_chunk,
+        )
+
+        decoder = LanguageSpecificBiTransformerDecoder(
+            config.output_dim,
+            config.encoder_output_size,
+            attention_heads=config.decoder_attention_heads,
+            dropout_rate=config.decoder_dropout_rate,
+            linear_units=config.decoder_linear_units,
+            num_blocks=config.decoder_num_blocks,
+            num_langs=config.decoder_num_langs,
+            positional_dropout_rate=config.decoder_positional_dropout_rate,
+            r_num_blocks=config.decoder_r_num_blocks,
+            self_attention_dropout_rate=config.decoder_self_attention_dropout_rate,
+            src_attention_dropout_rate=config.decoder_src_attention_dropout_rate,
+        )
+
+        ctc = CTC(
+            config.output_dim,
+            config.encoder_output_size,
+            config.ctc_blank_id,
+        )
+
+        self.model = ASRModel(
+            vocab_size=config.output_dim,
+            encoder=encoder,
+            decoder=decoder,
+            ctc=ctc,
+            special_tokens=config.special_tokens,
+            ctc_weight=config.ctc_weight,
+            lsm_weight=config.lsm_weight,
+            reverse_weight=config.reverse_weight,
+        )
+        self.model.lsl_enc = True
+        self.model.lsl_dec = True
+
+    def forward(
+        self,
+        input_features=None,
+        feats_lengths=None,
+        labels=None,
+        labels_lengths=None,
+        **kwargs,
+    ):
+        output = self.model.hf_forward(
+            input_features,
+            feats_lengths=feats_lengths,
+            labels=labels,
+            labels_lengths=labels_lengths,
+        )
+        return Seq2SeqLMOutput(
+            logits=output['ctc_probs'],
+            loss=output['loss'],
+        )
diff --git a/examples/huggingface/reverb_processor.py b/examples/huggingface/reverb_processor.py
new file mode 100644
index 0000000..c3aff5f
--- /dev/null
+++ b/examples/huggingface/reverb_processor.py
@@ -0,0 +1,132 @@
+import json
+from typing import List, Optional, Union
+import numpy as np
+import sentencepiece as spm
+import torch
+import torchaudio
+from torchaudio.compliance import kaldi
+from tqdm import tqdm
+from transformers import BatchFeature, PreTrainedTokenizer, ProcessorMixin, SequenceFeatureExtractor
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ReverbFeatureExtractor(SequenceFeatureExtractor):
+    model_input_names = ["input_features"]
+
+    def __init__(
+        self,
+        feature_size=80,
+        sampling_rate=16000,
+        frame_length=25,
+        frame_shift=10,
+        chunk_length=15,
+        padding_value=0.0,
+        **kwargs,
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            return_attention_mask=False,
+            **kwargs,
+        )
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.chunk_length = chunk_length
+        self.max_chunk_size = 2051
+        self._processor_class = "CTCWithLM"
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
+        device: Optional[str] = "cpu",
+        sampling_rate: Optional[int] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                # Raise instead of silently discarding a sampling-rate mismatch.
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
+                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
+                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray([speech], dtype=np.float32) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+
+        if not is_batched:
+            raw_speech = [np.asarray([raw_speech])]
+
+        # Compute kaldi-style log-mel filterbank features per utterance, then pad
+        # every example to max_chunk_size frames.
+        fbank_speech, feats_lengths = [], []
+        for waveform in raw_speech:
+            fbank_speech.append(
+                kaldi.fbank(
+                    torch.tensor(waveform),
+                    num_mel_bins=self.feature_size,
+                    frame_length=self.frame_length,
+                    frame_shift=self.frame_shift,
+                    dither=0.0,
+                    energy_floor=0.0,
+                    sample_frequency=self.sampling_rate,
+                )
+            )
+            feats_lengths.append(fbank_speech[-1].shape[0])
+        fbank_speech = BatchFeature({
+            "input_features": fbank_speech,
+            "feats_lengths": feats_lengths,
+        })
+        padded = self.pad(
+            fbank_speech,
+            padding="max_length",
+            max_length=self.max_chunk_size,
+        )
+        return padded
+
+
+class ReverbTokenizer(PreTrainedTokenizer):
+    def __init__(
+        self,
+        model: str,
+        #units: str,
+        **kwargs,
+    ):
+        self.tokenizer = spm.SentencePieceProcessor(model)
+        """self.units = dict()
+        with open(units, 'r') as units_file:
+            for line in tqdm(units_file.readlines()):
+                token, id = line.split()
+                self.units[int(id)] = token.replace('▁', ' ')"""
+
+    def encode(
+        self,
+        text,
+        **kwargs
+    ):
+        return self.tokenizer.encode(text)
+
+    def decode(
+        self,
+        token_ids,
+        **kwargs,
+    ):
+        # Drop zero-valued (blank/padding) ids before SentencePiece decoding.
+        return self.tokenizer.decode(token_ids[token_ids.nonzero()[0]].tolist())
diff --git a/examples/huggingface/transcribe.py b/examples/huggingface/transcribe.py
new file mode 100644
index 0000000..458c936
--- /dev/null
+++ b/examples/huggingface/transcribe.py
@@ -0,0 +1,46 @@
+import numpy as np
+from pyctcdecode import build_ctcdecoder
+import torch
+import torchaudio
+from transformers import pipeline
+from transformers import AutoConfig, AutoModelForSpeechSeq2Seq
+from reverb_hf import ReverbModel
+from reverb_config import ReverbConfig
+from reverb_processor import ReverbFeatureExtractor, ReverbTokenizer
+
+
+AutoConfig.register("reverb_asr", ReverbConfig)
+AutoModelForSpeechSeq2Seq.register(ReverbConfig, ReverbModel)
+feature_extractor = ReverbFeatureExtractor(return_tensors='pt')
+tokenizer = ReverbTokenizer(
+    "hf-reverb/tk.model",
+)
+decoder_ids = []
+with open("hf-reverb/tk.units.txt", 'r') as units_file:
+    for line in units_file:
+        token = line.split()[0]
+        if len(token) == 0:
+            continue
+        if token == '<blank>':
+            token = ''
+        decoder_ids.append(token)
+decoder = build_ctcdecoder(decoder_ids)
+
+transcribe = pipeline(
+    "automatic-speech-recognition",
+    model="hf-reverb",
+    feature_extractor=feature_extractor,
+    tokenizer=tokenizer,
+    framework='pt',
+    device='cpu', #crucial
+    decoder=decoder,
+    decoder_kwargs={"beam_width": 8, "token_min_logp": -10}
+)
+AUDIO_PATH = ""
+waveform, sample_rate = torchaudio.load(AUDIO_PATH, normalize=False)
+#print(waveform)
+waveform = np.array(waveform.to(torch.float).reshape(-1))
+
+# Transcribe the audio in chunk_length-second pieces.
+chunk_size_samples = feature_extractor.chunk_length * sample_rate
+for idx in range(0, len(waveform), chunk_size_samples):
print(transcribe(waveform[idx: idx+chunk_size_samples])['text']) From 8023c54a5c2960e2a17465083b40a2fa85777437 Mon Sep 17 00:00:00 2001 From: Jan Profant Date: Tue, 22 Apr 2025 19:43:51 +0200 Subject: [PATCH 2/3] first version --- examples/huggingface/reverb_config.py | 147 +++++++++++--------------- 1 file changed, 61 insertions(+), 86 deletions(-) diff --git a/examples/huggingface/reverb_config.py b/examples/huggingface/reverb_config.py index 5f04508..f7e1553 100644 --- a/examples/huggingface/reverb_config.py +++ b/examples/huggingface/reverb_config.py @@ -3,6 +3,8 @@ from typing import Dict, List, Optional from transformers import PretrainedConfig import numpy as np +import yaml +from pyctcdecode import build_ctcdecoder def cmvn(means: List[float], variance: List[float], count: int): @@ -17,99 +19,72 @@ def cmvn(means: List[float], variance: List[float], count: int): if variance[i] < 1.0e-20: variance[i] = 1.0e-20 variance[i] = 1.0 / math.sqrt(variance[i]) - # cmvn = np.array([means, variance]) return [means, variance] class ReverbConfig(PretrainedConfig): - # Not sure what to put but also not reqruied: model_type = "encoderdecoder" model_type = "reverb_asr" - def __init__( - self, - input_dim: int = 80, - output_dim: int = 10001, - cmvn_mean_stat : List[float] = [33596438528.0, 35418329088.0, 39182106624.0, 41983324160.0, 44419112960.0, 46015381504.0, 46934564864.0, 47058870272.0, 47288012800.0, 47522979840.0, 48491438080.0, 49308729344.0, 50230493184.0, 50796900352.0, 51020386304.0, 51297456128.0, 51333586944.0, 51126181888.0, 51455569920.0, 50636410880.0, 49947033600.0, 50365546496.0, 49383075840.0, 49540546560.0, 49066065920.0, 49236889600.0, 48820707328.0, 49071112192.0, 48968024064.0, 49024458752.0, 49202397184.0, 49374433280.0, 49620660224.0, 49947111424.0, 50326310912.0, 50717818880.0, 51046891520.0, 51345678336.0, 51655733248.0, 51505459200.0, 51813666816.0, 51577262080.0, 51776524288.0, 51754237952.0, 51918598144.0, 52158758912.0, 52405276672.0, 52596776960.0, 52639731712.0, 52631220224.0, 52443103232.0, 52315619328.0, 52219695104.0, 52178399232.0, 52083040256.0, 52064792576.0, 51980918784.0, 51824164864.0, 51550973952.0, 51002216448.0, 50422747136.0, 49847754752.0, 49474338816.0, 48997863424.0, 48617009152.0, 48309174272.0, 48084140032.0, 48095608832.0, 47965765632.0, 47909335040.0, 47780065280.0, 47762370560.0, 47757099008.0, 47731314688.0, 47574110208.0, 47336361984.0, 47009054720.0, 46283513856.0, 44821860352.0, 42771775488.0], - cmvn_var_stat: List[float] = [360475131904.0, 401487724544.0, 484368646144.0, 548414357504.0, 608912080896.0, 651613241344.0, 678013698048.0, 683624693760.0, 689524047872.0, 695375822848.0, 722376851456.0, 746773872640.0, 774244204544.0, 791678353408.0, 798920015872.0, 807307444224.0, 808713453568.0, 802957754368.0, 812319899648.0, 788076953600.0, 767619497984.0, 777970712576.0, 748566544384.0, 751065628672.0, 736340869120.0, 739872473088.0, 727466704896.0, 734006083584.0, 731017904128.0, 732582576128.0, 737590444032.0, 742469861376.0, 749455671296.0, 758746972160.0, 769666121728.0, 781107331072.0, 790730506240.0, 799342002176.0, 808164917248.0, 803454713856.0, 812040585216.0, 804632395776.0, 809866821632.0, 808861499392.0, 813548044288.0, 820701954048.0, 828343779328.0, 834335604736.0, 835754590208.0, 835251011584.0, 829192929280.0, 824705744896.0, 821224734720.0, 819399753728.0, 816182853632.0, 815243788288.0, 812578177024.0, 807846281216.0, 799796035584.0, 784661544960.0, 770915631104.0, 756696285184.0, 746462183424.0, 734193254400.0, 
724980072448.0, 717529612288.0, 711156563968.0, 710358204416.0, 706386919424.0, 704228884480.0, 700537110528.0, 699519008768.0, 699025129472.0, 698035535872.0, 693109391360.0, 686047887360.0, 676213948416.0, 655917645824.0, 616676458496.0, 563932168192.0], - cmvn_frame_num: int = 3519342927, - encoder: str = "conformer", - encoder_activation_type: str = "swish", - encoder_attention_dropout_rate: float = 0.1, - encoder_attention_heads: int = 8, - encoder_causal: bool = True, - encoder_cnn_module_kernel: int = 31, - encoder_cnn_module_norm: str = "layer_norm", - encoder_dropout_rate: float = 0.1, - encoder_input_layer: str = "conv2d", - encoder_linear_units: int = 2048, - encoder_normalize_before: bool = True, - encoder_num_blocks: int = 18, - encoder_num_langs: int = 2, - encoder_output_size: int = 640, - encoder_pos_enc_layer_type: str = "rel_pos", - encoder_positional_dropout_rate: float = 0.1, - encoder_selfattention_layer_type: str = "rel_selfattn", - encoder_use_cnn_module: bool = True, - encoder_use_dynamic_chunk: bool = True, - decoder: str = "lslbitransformer", - decoder_attention_heads: int = 8, - decoder_dropout_rate: float = 0.1, - decoder_linear_units: int = 2048, - decoder_num_blocks: int = 6, - decoder_num_langs: int = 2, - decoder_positional_dropout_rate: float = 0.1, - decoder_r_num_blocks: int = 6, - decoder_self_attention_dropout_rate: float = 0.1, - decoder_src_attention_dropout_rate: float = 0.1, - ctc_blank_id: int = 0, - ctc_weight: float = 0.3, - lsm_weight: float = 0.1, - reverse_weight: float = 0.3, - special_tokens: Optional[Dict[str, int]] = None, - **kwargs, - ): - self.input_dim = input_dim - self.output_dim = output_dim - self.encoder = encoder - self.encoder_activation_type = encoder_activation_type - self.encoder_attention_dropout_rate = encoder_attention_dropout_rate - self.encoder_attention_heads = encoder_attention_heads - self.encoder_causal = encoder_causal - self.encoder_cnn_module_kernel = encoder_cnn_module_kernel - self.encoder_cnn_module_norm = encoder_cnn_module_norm - self.encoder_dropout_rate = encoder_dropout_rate - self.encoder_input_layer = encoder_input_layer - self.encoder_linear_units = encoder_linear_units - self.encoder_normalize_before = encoder_normalize_before - self.encoder_num_blocks = encoder_num_blocks - self.encoder_num_langs = encoder_num_langs - self.encoder_output_size = encoder_output_size - self.encoder_pos_enc_layer_type = encoder_pos_enc_layer_type - self.encoder_positional_dropout_rate = encoder_positional_dropout_rate - self.encoder_selfattention_layer_type = encoder_selfattention_layer_type - self.encoder_use_cnn_module = encoder_use_cnn_module - self.encoder_use_dynamic_chunk = encoder_use_dynamic_chunk - self.decoder = decoder - self.decoder_attention_heads = decoder_attention_heads - self.decoder_dropout_rate = decoder_dropout_rate - self.decoder_linear_units = decoder_linear_units - self.decoder_num_blocks = decoder_num_blocks - self.decoder_num_langs = decoder_num_langs - self.decoder_positional_dropout_rate = decoder_positional_dropout_rate - self.decoder_r_num_blocks = decoder_r_num_blocks - self.decoder_self_attention_dropout_rate = decoder_self_attention_dropout_rate - self.decoder_src_attention_dropout_rate = decoder_src_attention_dropout_rate - self.ctc_blank_id = ctc_blank_id - self.ctc_weight = ctc_weight - self.lsm_weight = lsm_weight - self.reverse_weight = reverse_weight - if special_tokens is None: - special_tokens = { + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Set default 
special tokens if not provided + if not hasattr(self, 'special_tokens'): + self.special_tokens = { "": 0, "": 2, "": 2, "": 1, } - self.special_tokens = special_tokens - self.cmvn_mean, self.cmvn_istd = cmvn(cmvn_mean_stat, cmvn_var_stat, cmvn_frame_num) - self.inputs_to_logits_ratio = 1 - super().__init__(**kwargs) + + # Calculate CMVN if the required stats are provided + if hasattr(self, 'cmvn_mean_stat') and hasattr(self, 'cmvn_var_stat') and hasattr(self, 'cmvn_frame_num'): + self.cmvn_mean, self.cmvn_istd = cmvn( + self.cmvn_mean_stat, + self.cmvn_var_stat, + self.cmvn_frame_num + ) + + # Set default ratio if not provided + if not hasattr(self, 'inputs_to_logits_ratio'): + self.inputs_to_logits_ratio = 1 + + # Tokenizer configuration + if not hasattr(self, 'tokenizer_path'): + self.tokenizer_path = "path/to/tokenizer.model" + if not hasattr(self, 'units_path'): + self.units_path = "path/to/units.txt" + if not hasattr(self, 'decoder_beam_width'): + self.decoder_beam_width = 8 + if not hasattr(self, 'decoder_token_min_logp'): + self.decoder_token_min_logp = -10 + + # Load units and build decoder + self._load_units_and_build_decoder() + + def _load_units_and_build_decoder(self): + """Load units from file and build the CTC decoder.""" + decoder_ids = [] + with open(self.units_path, 'r') as units_file: + for line in units_file: + token = line.split()[0] + if len(token) == 0: + continue + if token == '': + token = '' + decoder_ids.append(token) + self.decoder = build_ctcdecoder(decoder_ids) + + @classmethod + def from_yaml_file(cls, yaml_file: str) -> "ReverbConfig": + """Load a ReverbConfig from a YAML file. + + Args: + yaml_file: Path to the YAML file containing the configuration + + Returns: + A ReverbConfig instance loaded from the file + """ + with open(yaml_file, 'r') as f: + config_dict = yaml.safe_load(f) + return cls(**config_dict) \ No newline at end of file From a06252b07f9aad185f9e1e8cb1f686fbba3248a8 Mon Sep 17 00:00:00 2001 From: Jan Profant Date: Tue, 22 Apr 2025 19:44:55 +0200 Subject: [PATCH 3/3] wip --- examples/huggingface/reverb_config.yaml | 48 +++++++++++++++++++++++++ examples/huggingface/transcribe.py | 43 +++++++++++++--------- 2 files changed, 74 insertions(+), 17 deletions(-) create mode 100644 examples/huggingface/reverb_config.yaml diff --git a/examples/huggingface/reverb_config.yaml b/examples/huggingface/reverb_config.yaml new file mode 100644 index 0000000..6003377 --- /dev/null +++ b/examples/huggingface/reverb_config.yaml @@ -0,0 +1,48 @@ +model_type: reverb_asr +input_dim: 80 +output_dim: 10001 +cmvn_mean_stat: [33596438528.0, 35418329088.0, 39182106624.0, 41983324160.0, 44419112960.0, 46015381504.0, 46934564864.0, 47058870272.0, 47288012800.0, 47522979840.0, 48491438080.0, 49308729344.0, 50230493184.0, 50796900352.0, 51020386304.0, 51297456128.0, 51333586944.0, 51126181888.0, 51455569920.0, 50636410880.0, 49947033600.0, 50365546496.0, 49383075840.0, 49540546560.0, 49066065920.0, 49236889600.0, 48820707328.0, 49071112192.0, 48968024064.0, 49024458752.0, 49202397184.0, 49374433280.0, 49620660224.0, 49947111424.0, 50326310912.0, 50717818880.0, 51046891520.0, 51345678336.0, 51655733248.0, 51505459200.0, 51813666816.0, 51577262080.0, 51776524288.0, 51754237952.0, 51918598144.0, 52158758912.0, 52405276672.0, 52596776960.0, 52639731712.0, 52631220224.0, 52443103232.0, 52315619328.0, 52219695104.0, 52178399232.0, 52083040256.0, 52064792576.0, 51980918784.0, 51824164864.0, 51550973952.0, 51002216448.0, 50422747136.0, 49847754752.0, 49474338816.0, 48997863424.0, 
48617009152.0, 48309174272.0, 48084140032.0, 48095608832.0, 47965765632.0, 47909335040.0, 47780065280.0, 47762370560.0, 47757099008.0, 47731314688.0, 47574110208.0, 47336361984.0, 47009054720.0, 46283513856.0, 44821860352.0, 42771775488.0] +cmvn_var_stat: [360475131904.0, 401487724544.0, 484368646144.0, 548414357504.0, 608912080896.0, 651613241344.0, 678013698048.0, 683624693760.0, 689524047872.0, 695375822848.0, 722376851456.0, 746773872640.0, 774244204544.0, 791678353408.0, 798920015872.0, 807307444224.0, 808713453568.0, 802957754368.0, 812319899648.0, 788076953600.0, 767619497984.0, 777970712576.0, 748566544384.0, 751065628672.0, 736340869120.0, 739872473088.0, 727466704896.0, 734006083584.0, 731017904128.0, 732582576128.0, 737590444032.0, 742469861376.0, 749455671296.0, 758746972160.0, 769666121728.0, 781107331072.0, 790730506240.0, 799342002176.0, 808164917248.0, 803454713856.0, 812040585216.0, 804632395776.0, 809866821632.0, 808861499392.0, 813548044288.0, 820701954048.0, 828343779328.0, 834335604736.0, 835754590208.0, 835251011584.0, 829192929280.0, 824705744896.0, 821224734720.0, 819399753728.0, 816182853632.0, 815243788288.0, 812578177024.0, 807846281216.0, 799796035584.0, 784661544960.0, 770915631104.0, 756696285184.0, 746462183424.0, 734193254400.0, 724980072448.0, 717529612288.0, 711156563968.0, 710358204416.0, 706386919424.0, 704228884480.0, 700537110528.0, 699519008768.0, 699025129472.0, 698035535872.0, 693109391360.0, 686047887360.0, 676213948416.0, 655917645824.0, 616676458496.0, 563932168192.0] +cmvn_frame_num: 3519342927 +encoder: conformer +encoder_activation_type: swish +encoder_attention_dropout_rate: 0.1 +encoder_attention_heads: 8 +encoder_causal: true +encoder_cnn_module_kernel: 31 +encoder_cnn_module_norm: layer_norm +encoder_dropout_rate: 0.1 +encoder_input_layer: conv2d +encoder_linear_units: 2048 +encoder_normalize_before: true +encoder_num_blocks: 18 +encoder_num_langs: 2 +encoder_output_size: 640 +encoder_pos_enc_layer_type: rel_pos +encoder_positional_dropout_rate: 0.1 +encoder_selfattention_layer_type: rel_selfattn +encoder_use_cnn_module: true +encoder_use_dynamic_chunk: true +decoder: lslbitransformer +decoder_attention_heads: 8 +decoder_dropout_rate: 0.1 +decoder_linear_units: 2048 +decoder_num_blocks: 6 +decoder_num_langs: 2 +decoder_positional_dropout_rate: 0.1 +decoder_r_num_blocks: 6 +decoder_self_attention_dropout_rate: 0.1 +decoder_src_attention_dropout_rate: 0.1 +ctc_blank_id: 0 +ctc_weight: 0.3 +lsm_weight: 0.1 +reverse_weight: 0.3 +special_tokens: + "": 0 + "": 2 + "": 2 + "": 1 +tokenizer_path: "hf-reverb/tk.model" +units_path: "hf-reverb/tk.units.txt" +decoder_beam_width: 8 +decoder_token_min_logp: -10 \ No newline at end of file diff --git a/examples/huggingface/transcribe.py b/examples/huggingface/transcribe.py index 458c936..d99077b 100644 --- a/examples/huggingface/transcribe.py +++ b/examples/huggingface/transcribe.py @@ -1,5 +1,11 @@ +import sys +import os + +# Add the project root directory to Python path +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../asr/")) +sys.path.append(project_root) + import numpy as np -from pyctcdecode import build_ctcdecoder import torch import torchaudio from transformers import pipeline @@ -9,33 +15,36 @@ from reverb_processor import ReverbFeatureExtractor, ReverbTokenizer +# Register the custom model and config AutoConfig.register("reverb_asr", ReverbConfig) AutoModelForSpeechSeq2Seq.register(ReverbConfig, ReverbModel) + +# Load configuration +config = 
ReverbConfig.from_yaml_file("reverb_config.yaml")
+
+# Initialize feature extractor and tokenizer using config
 feature_extractor = ReverbFeatureExtractor(return_tensors='pt')
-tokenizer = ReverbTokenizer(
-    "hf-reverb/tk.model",
-)
-decoder_ids = []
-with open("hf-reverb/tk.units.txt", 'r') as units_file:
-    for line in units_file:
-        token = line.split()[0]
-        if len(token) == 0:
-            continue
-        if token == '<blank>':
-            token = ''
-        decoder_ids.append(token)
-decoder = build_ctcdecoder(decoder_ids)
+tokenizer = ReverbTokenizer(config.tokenizer_path)
+
+# Initialize model
+model = ReverbModel(config)
+
+# Initialize transcription pipeline
 transcribe = pipeline(
     "automatic-speech-recognition",
-    model="hf-reverb",
+    model=model,
     feature_extractor=feature_extractor,
     tokenizer=tokenizer,
     framework='pt',
     device='cpu', #crucial
-    decoder=decoder,
-    decoder_kwargs={"beam_width": 8, "token_min_logp": -10}
+    decoder=config.decoder,
+    decoder_kwargs={
+        "beam_width": config.decoder_beam_width,
+        "token_min_logp": config.decoder_token_min_logp
+    }
 )
+
+# Process audio
 AUDIO_PATH = ""
 waveform, sample_rate = torchaudio.load(AUDIO_PATH, normalize=False)
 #print(waveform)
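
For reference, the data flow these patches wire into the Hugging Face pipeline (feature extraction, the ReverbModel forward pass, and CTC beam-search decoding with the decoder that ReverbConfig builds from the units file) can also be exercised by hand. The snippet below is a minimal sketch and is not part of the patches: it assumes the three example modules above are importable, that "example.wav" is only a placeholder for a 16 kHz mono recording, and that converted weights are loaded into model.model separately, since the patch subject itself notes the model files are missing.

# Hedged sketch only: mirrors the pipeline's data flow by hand; weight loading is not
# shown because the patches above do not include the converted model files.
import numpy as np
import torch
import torchaudio

from reverb_config import ReverbConfig
from reverb_hf import ReverbModel
from reverb_processor import ReverbFeatureExtractor

config = ReverbConfig.from_yaml_file("reverb_config.yaml")
model = ReverbModel(config).eval()   # assumes trained weights are loaded into model.model elsewhere
feature_extractor = ReverbFeatureExtractor()

# Same audio handling as transcribe.py: integer samples cast to float, flattened to mono.
waveform, sample_rate = torchaudio.load("example.wav", normalize=False)  # placeholder path
samples = np.array(waveform.to(torch.float).reshape(-1))

features = feature_extractor(samples, sampling_rate=sample_rate)
input_features = torch.tensor(np.array(features["input_features"]))
feats_lengths = torch.tensor(features["feats_lengths"])

with torch.no_grad():
    output = model(input_features=input_features, feats_lengths=feats_lengths)

# output.logits carries the per-frame CTC posteriors; decode them with the
# pyctcdecode decoder that ReverbConfig builds from the units file.
log_probs = output.logits[0].numpy()
print(config.decoder.decode(log_probs, beam_width=config.decoder_beam_width))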