diff --git a/recipes/Fisher-Callhome-Spanish/ST/espnet-transformer/hparams/conformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/espnet-transformer/hparams/conformer.yaml new file mode 100644 index 0000000000..5959970089 --- /dev/null +++ b/recipes/Fisher-Callhome-Spanish/ST/espnet-transformer/hparams/conformer.yaml @@ -0,0 +1,203 @@ +# ############################################################################ +# Model: E2E ST with Conformer from ESPnet +# Encoder: Conformer Encoder +# Decoder: Transformer Decoder beamsearch +# Tokens: BPE +# losses: CTC + KLdiv (Label Smoothing loss) +# Training: Fisher-Callhome +# Authors: YAO-FEI, CHENG +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made +# The original recipe is from ESPnet: +# https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/conf/tuning/train_pytorch_conformer.yaml + +debug: False +seed: 8886 +num_workers: 8 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/conformer_espnet_mid_sp/ +ckpt_interval_minutes: 15 # save checkpoint every N min +bleu_file: !ref /bleu.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files + +data_folder: !PLACEHOLDER # Folder of the files generated by the preparation script +tokenizer_file: !PLACEHOLDER # .model file corresponding to the Tokenizer model + +# Tokenier initialization +tokenizer: !new:sentencepiece.SentencePieceProcessor + +# Pretrain the tokenizer +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: ./tokenizer + loadables: + tokenizer: !ref + paths: + tokenizer: !ref + +# The train logger writes training statistics to a file, as well as stdout. 
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Features +sample_rate: 16000 +n_fft: 400 +n_mels: 80 + +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + update_until_epoch: 4 + +speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb + orig_freq: !ref + speeds: [90, 100, 110] + +# Trainer settings +number_of_epochs: 30 +valid_search_eopch: 100 +batch_size: 8 # this works for 2 GPUs with 11GB +gradient_accumulation: 16 +gradient_clipping: 5.0 +loss_reduction: batchmean +sorting: random + +# stages related parameters +stage_one_epochs: 100 # not gonna changing optimizer in this recipe +lr_adam: 2.5 +lr_sgd: 0.000025 + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + +test_dataloader_opts: + batch_size: !ref + num_workers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 256 +nhead: 4 +num_encoder_layers: 12 +num_decoder_layers: 6 +d_ffn: 2048 +transformer_dropout: 0.1 +activation: !name:torch.nn.GELU +output_neurons: 1000 +vocab_size: 1000 +attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA" +kernel_size: 15 +encoder_module: conformer + +# Multi-task +# don't forget to uncomment the ctc_lin in modules section (line:190) when using ctc +ctc_weight: 0 +asr_weight: 0 +mt_weight: 0 + +# Outputs +blank_index: 0 +label_smoothing: 0.1 +pad_index: 0 +bos_index: 1 +eos_index: 2 +unk_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 +valid_search_interval: !ref +valid_beam_size: 10 +test_beam_size: 10 + +############################## models ################################ +CNN: 
!new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd + input_shape: (8, 10, 80) + num_blocks: 2 + num_layers_per_block: 1 + out_channels: (256, 256) + kernel_sizes: (3, 3) + strides: (2, 2) + residuals: (False, False) + +Transformer: !new:speechbrain.lobes.models.transformer.ESPNetConformer.E2E # yamllint disable-line rule:line-length + idim: !ref + odim: !ref + adim: !ref + aheads: !ref + wshare: 4 + ldconv_encoder_kernel_length: "21_23_25_27_29_31_33_35_37_39_41_43" + ldconv_usebias: False + eunits: !ref + elayers: !ref + transformer_input_layer: "conv2d" + transformer_encoder_selfattn_layer_type: "rel_selfattn" + transformer_decoder_selfattn_layer_type: "selfattn" + ldconv_decoder_kernel_length: "11_13_15_17_19_21" + dunits: !ref + dlayers: !ref + dropout_rate: !ref + sos: !ref + eos: !ref + ignore_id: !ref + transformer_encoder_pos_enc_layer_type: "rel_pos" + transformer_encoder_activation_type: "swish" + macaron_style: True + use_cnn_module: True + cnn_module_kernel: !ref + +modules: + Transformer: !ref + +model: !new:torch.nn.ModuleList + - [!ref , !ref ] + +# define two optimizers here for two-stage training +Adam: !name:torch.optim.Adam + lr: 0 + betas: (0.9, 0.98) + eps: 0.000000001 + +SGD: !name:torch.optim.SGD + lr: !ref + momentum: 0.99 + nesterov: True + +seq_cost: !new:espnet.nets.pytorch_backend.transformer.label_smoothing_loss.LabelSmoothingLoss + size: !ref + padding_idx: !ref + smoothing: !ref + normalize_length: False + +noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: 35000 + model_size: !ref + +# Checkpoint setting +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + noam_scheduler: !ref + normalizer: !ref + counter: !ref + +bleu_computer: !name:speechbrain.utils.bleu.BLEUStats + merge_words: False +acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats diff --git 
#!/usr/bin/env python3
"""Recipe for training an ESPnet Conformer based ST system with Fisher-Callhome.

The acoustic features are computed with SpeechBrain, while the encoder-decoder
model (``hparams["Transformer"]``) is the ESPnet E2E model. At test time the
hypotheses are produced by the model's own ``translate`` method.

To run this recipe, do the following:
> python train.py hparams/conformer.yaml

Authors
 * YAO-FEI, CHENG 2021
"""

import sys
from typing import List
import torch
import logging

import speechbrain as sb

from sacremoses import MosesDetokenizer
from hyperpyyaml import load_hyperpyyaml
from speechbrain.utils.distributed import run_on_main

logger = logging.getLogger(__name__)
en_detoeknizer = MosesDetokenizer(lang="en")


class ST(sb.core.Brain):
    """Brain subclass that trains and evaluates the speech-translation model."""

    def compute_forward(self, batch, stage):
        """Compute features and run the model on one batch.

        Arguments
        ---------
        batch : the padded batch; ``batch.sig`` and ``batch.tokens`` are read.
        stage : sb.Stage
            TRAIN, VALID or TEST.

        Returns
        -------
        pred_pad
            Third output of the model call for TRAIN/VALID, ``None`` for TEST.
        hyps
            ``None`` for TRAIN, ``enc_out.argmax(dim=-1)`` for VALID, and a
            list with the best ``yseq`` of every utterance for TEST.
        """
        batch = batch.to(self.device)

        wavs, wav_lens = batch.sig

        tokens, _ = batch.tokens  # for translation task

        # compute features
        feats = self.hparams.compute_features(wavs)
        current_epoch = self.hparams.epoch_counter.current
        feats = self.hparams.normalize(feats, wav_lens, epoch=current_epoch)

        # The input sizes corresponding to ilens in ESPnet
        feature_sizes = torch.round(wav_lens * feats.shape[1]).int()

        # forward modules
        if stage == sb.Stage.TEST:
            from argparse import Namespace

            pred_pad = None
            # NOTE(review): these decoding options are hard-coded here and are
            # not read from test_beam_size / max_decode_ratio in the hparams.
            args = {
                "beam_size": 10,
                "penalty": 0.3,
                "maxlenratio": 0.3,
                "minlenratio": 0.0,
                "nbest": 1,
            }
            args = Namespace(**args)
            vocabs = read_vocab()

            hyps = []
            # the model's translate() is called on one utterance at a time
            for feat in feats:
                with torch.no_grad():
                    top_b_hyps = self.hparams.Transformer.translate(
                        feat, args, vocabs
                    )
                    hyp = top_b_hyps[0]["yseq"]

                hyps.append(hyp)
        else:
            enc_out, enc_mask, pred_pad, pred_mask = self.hparams.Transformer(
                feats, feature_sizes, tokens
            )

            # compute outputs
            if stage == sb.Stage.TRAIN:
                hyps = None
            elif stage == sb.Stage.VALID:
                # NOTE(review): the argmax is taken over the first model
                # output (enc_out), not over pred_pad -- confirm this is the
                # tensor intended for the validation hypotheses.
                hyps = enc_out.argmax(dim=-1)

        return pred_pad, hyps

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss given predictions and targets.

        For TEST only the 4-reference BLEU statistics are accumulated and a
        zero tensor is returned. For VALID the accuracy metric is always
        updated and BLEU is accumulated every ``valid_search_interval`` epochs.
        """
        (pred_pad, hyps,) = predictions

        ids = batch.id
        tokens_eos, tokens_eos_lens = batch.tokens_eos
        current_epoch = self.hparams.epoch_counter.current
        valid_search_interval = self.hparams.valid_search_interval

        loss = torch.tensor(0)
        if stage == sb.Stage.TEST:
            # 4 references bleu score
            predictions = [
                en_detoeknizer.detokenize(
                    self.hparams.tokenizer.decode_ids(utt_seq).split(" ")
                )
                for utt_seq in hyps
            ]

            four_references = [
                batch.translation_0,
                batch.translation_1,
                batch.translation_2,
                batch.translation_3,
            ]

            targets = []
            for reference in four_references:
                detokenized_translation = [
                    en_detoeknizer.detokenize(translation.split(" "))
                    for translation in reference
                ]
                targets.append(detokenized_translation)

            self.bleu_metric.append(ids, predictions, targets)
        elif stage == sb.Stage.VALID:
            if current_epoch % valid_search_interval == 0:
                predictions = [
                    en_detoeknizer.detokenize(
                        self.hparams.tokenizer.decode_ids(utt_seq).split(" ")
                    )
                    for utt_seq in hyps
                ]

                targets = [
                    en_detoeknizer.detokenize(translation.split(" "))
                    for translation in batch.translation_0
                ]
                self.bleu_metric.append(ids, predictions, [targets])

            # compute the accuracy of the one-step-forward prediction
            self.acc_metric.append(pred_pad, tokens_eos, tokens_eos_lens)
            loss = self.hparams.seq_cost(pred_pad, tokens_eos)
        else:
            loss = self.hparams.seq_cost(pred_pad, tokens_eos)

        return loss

    def fit_batch(self, batch):
        """Train the parameters given a single batch in input."""
        # check if we need to switch optimizer
        # if so change the optimizer from Adam to SGD
        self.check_and_reset_optimizer()
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)

        # normalize the loss by gradient_accumulation step
        (loss / self.hparams.gradient_accumulation).backward()

        if self.step % self.hparams.gradient_accumulation == 0:
            # gradient clipping & early stop if loss is not finite
            self.check_gradients(loss)

            self.optimizer.step()
            self.optimizer.zero_grad()

            # anneal lr every update
            self.hparams.noam_annealing(self.optimizer)

        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Create fresh accuracy/BLEU trackers for every non-training stage."""
        if stage != sb.Stage.TRAIN:
            self.acc_metric = self.hparams.acc_computer()
            self.bleu_metric = self.hparams.bleu_computer()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Summarize, log and checkpoint at the end of a stage."""
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            if stage == sb.Stage.VALID:
                stage_stats["ACC"] = self.acc_metric.summarize()
            current_epoch = self.hparams.epoch_counter.current
            valid_search_interval = self.hparams.valid_search_interval

            if stage == sb.Stage.TEST:
                stage_stats["BLEU"] = self.bleu_metric.summarize("BLEU")
            elif (
                current_epoch % valid_search_interval == 0
                and stage == sb.Stage.VALID
            ):
                stage_stats["BLEU"] = self.bleu_metric.summarize("BLEU")

        # log stats and save checkpoint at end-of-epoch
        if stage == sb.Stage.VALID and sb.utils.distributed.if_main_process():
            # report different epoch stages according current stage
            current_epoch = self.hparams.epoch_counter.current
            if current_epoch <= self.hparams.stage_one_epochs:
                lr = self.hparams.noam_annealing.current_lr
                steps = self.hparams.noam_annealing.n_steps
                optimizer = self.optimizer.__class__.__name__
            else:
                lr = self.hparams.lr_sgd
                steps = -1
                optimizer = self.optimizer.__class__.__name__

            epoch_stats = {
                "epoch": epoch,
                "lr": lr,
                "steps": steps,
                "optimizer": optimizer,
            }
            self.hparams.train_logger.log_stats(
                stats_meta=epoch_stats,
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ACC": stage_stats["ACC"], "epoch": epoch},
                max_keys=["ACC"],
                num_to_keep=5,
            )

        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )

            with open(self.hparams.bleu_file, "a+", encoding="utf-8") as w:
                self.bleu_metric.write_stats(w)

            # save the averaged checkpoint at the end of the evaluation stage
            # delete the rest of the intermediate checkpoints
            # ACC is set to 1.1 so checkpointer only keeps the averaged checkpoint
            self.checkpointer.save_and_keep_only(
                meta={"ACC": 1.1, "epoch": epoch},
                max_keys=["ACC"],
                num_to_keep=1,
            )

    def check_and_reset_optimizer(self):
        """Switch from Adam to SGD once training enters stage 2."""
        current_epoch = self.hparams.epoch_counter.current
        if not hasattr(self, "switched"):
            # first call: an SGD optimizer means the switch already happened
            self.switched = False
            if isinstance(self.optimizer, torch.optim.SGD):
                self.switched = True

        if self.switched is True:
            return

        if current_epoch > self.hparams.stage_one_epochs:
            self.optimizer = self.hparams.SGD(self.modules.parameters())

            if self.checkpointer is not None:
                self.checkpointer.add_recoverable("optimizer", self.optimizer)

            self.switched = True

    def on_fit_start(self):
        """Initialize the right optimizer on the training start."""
        super().on_fit_start()

        # if the model is resumed from stage two, reinitialize the optimizer
        current_epoch = self.hparams.epoch_counter.current
        current_optimizer = self.optimizer
        if current_epoch > self.hparams.stage_one_epochs:
            del self.optimizer
            self.optimizer = self.hparams.SGD(self.modules.parameters())

            # Load latest checkpoint to resume training if interrupted
            if self.checkpointer is not None:

                # do not reload the weights if training is interrupted right
                # before stage 2 (the previous optimizer has no "momentum"
                # entry in its first parameter group)
                group = current_optimizer.param_groups[0]
                if "momentum" not in group:
                    return

                self.checkpointer.recover_if_possible(
                    device=torch.device(self.device)
                )

    def on_evaluate_start(self, max_key=None, min_key=None):
        """Average the selected checkpoints and load them before evaluating."""
        super().on_evaluate_start()

        ckpts = self.checkpointer.find_checkpoints(
            max_key=max_key, min_key=min_key
        )
        ckpt = sb.utils.checkpoints.average_checkpoints(
            ckpts, recoverable_name="model", device=self.device
        )

        self.hparams.model.load_state_dict(ckpt, strict=True)
        self.hparams.model.eval()


def dataio_prepare(hparams):
    """Prepare the datasets to be used in the brain class.

    It also defines the data processing pipeline through user-defined
    functions.

    Returns
    -------
    dict
        Datasets keyed by "train-sp", "valid" (single-reference dev set), and
        "dev", "dev2", "test" (four-reference evaluation sets).

    Raises
    ------
    NotImplementedError
        If ``hparams["sorting"]`` is not random, ascending or descending.
    """

    # Define audio pipeline. In this case, we simply read the path contained
    # in the variable wav with the audio reader.
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        """Load the audio signal. This is done on the CPU in the `collate_fn`."""
        sig = sb.dataio.dataio.read_audio(wav)
        return sig

    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def sp_audio_pipeline(wav):
        """Load the audio signal and apply the speed perturbation to it."""
        sig = sb.dataio.dataio.read_audio(wav)
        sig = sig.unsqueeze(0)
        sig = hparams["speed_perturb"](sig)
        sig = sig.squeeze(0)
        return sig

    # Define text processing pipeline. We start from the raw text and then
    # encode it using the tokenizer. The tokens with BOS are used for feeding
    # decoder during training, the tokens with EOS for computing the cost function.
    # The tokens without BOS or EOS is for computing CTC loss.
    @sb.utils.data_pipeline.takes("translation_0")
    @sb.utils.data_pipeline.provides(
        "translation_0", "tokens_list", "tokens_bos", "tokens_eos", "tokens",
    )
    def one_reference_text_pipeline(translation):
        """Processes the translation to generate proper labels"""
        yield translation
        tokens_list = hparams["tokenizer"].encode_as_ids(translation)
        yield tokens_list
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    @sb.utils.data_pipeline.takes(
        "translation_0", "translation_1", "translation_2", "translation_3",
    )
    @sb.utils.data_pipeline.provides(
        "translation_0",
        "translation_1",
        "translation_2",
        "translation_3",
        "tokens_list",
        "tokens_bos",
        "tokens_eos",
        "tokens",
    )
    def four_reference_text_pipeline(*translations):
        """Pass the four references through; tokens come from the first one."""
        yield translations[0]
        yield translations[1]
        yield translations[2]
        yield translations[3]
        tokens_list = hparams["tokenizer"].encode_as_ids(translations[0])
        yield tokens_list
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    @sb.utils.data_pipeline.takes("transcription")
    @sb.utils.data_pipeline.provides(
        "transcription",
        "transcription_list",
        "transcription_bos",
        "transcription_eos",
        "transcription_tokens",
    )
    def transcription_text_pipeline(transcription):
        """Processes the transcription to generate proper labels"""
        yield transcription
        tokens_list = hparams["tokenizer"].encode_as_ids(transcription)
        yield tokens_list
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    # Key under which the training set is stored; it is also the name of the
    # folder that holds its data.json.
    train_key = "train-sp"

    datasets = {}
    data_folder = hparams["data_folder"]
    for dataset in [train_key, "dev"]:
        json_path = f"{data_folder}/{dataset}/data.json"
        dataset = dataset if dataset == train_key else "valid"

        # NOTE(review): the key is "train-sp", so this comparison with "train"
        # is never true and sp_audio_pipeline is never selected. Left as is
        # because the "train-sp" data may already be speed perturbed offline
        # -- confirm before enabling on-the-fly perturbation.
        is_use_sp = dataset == "train" and "speed_perturb" in hparams
        audio_pipeline_func = sp_audio_pipeline if is_use_sp else audio_pipeline

        datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=json_path,
            replacements={"data_root": data_folder},
            dynamic_items=[
                audio_pipeline_func,
                one_reference_text_pipeline,
                transcription_text_pipeline,
            ],
            output_keys=[
                "id",
                "sig",
                "duration",
                "translation_0",
                "tokens_bos",
                "tokens_eos",
                "tokens",
                "transcription",
                "transcription_list",
                "transcription_bos",
                "transcription_eos",
                "transcription_tokens",
            ],
        )

    for dataset in ["dev", "dev2", "test"]:
        json_path = f"{data_folder}/{dataset}/data.json"
        datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=json_path,
            replacements={"data_root": data_folder},
            dynamic_items=[
                audio_pipeline,
                four_reference_text_pipeline,
                transcription_text_pipeline,
            ],
            output_keys=[
                "id",
                "sig",
                "duration",
                "translation_0",
                "translation_1",
                "translation_2",
                "translation_3",
                "tokens_bos",
                "tokens_eos",
                "tokens",
                "transcription",
                "transcription_list",
                "transcription_bos",
                "transcription_eos",
                "transcription_tokens",
            ],
        )

    # Sorting training data with ascending order makes the code much
    # faster because we minimize zero-padding. In most of the cases, this
    # does not harm the performance.
    sorting = hparams["sorting"]
    if sorting in ("ascending", "descending"):
        # The training set lives under train_key ("train-sp"); indexing
        # datasets["train"] here used to raise a KeyError. The order is
        # reversed only for "descending" (the debug path of "ascending" used
        # to be reversed as well).
        sort_kwargs = {
            "sort_key": "duration",
            "reverse": sorting == "descending",
        }
        # use smaller dataset to debug the model
        if hparams["debug"]:
            sort_kwargs["key_min_value"] = {"duration": 1}
            sort_kwargs["key_max_value"] = {"duration": 5}

        for name in (train_key, "valid"):
            datasets[name] = datasets[name].filtered_sorted(**sort_kwargs)

        hparams["train_dataloader_opts"]["shuffle"] = False
        hparams["valid_dataloader_opts"]["shuffle"] = False
    elif sorting == "random":
        # use smaller dataset to debug the model
        if hparams["debug"]:
            datasets[train_key] = datasets[train_key].filtered_sorted(
                key_min_value={"duration": 5},
                key_max_value={"duration": 5},
                sort_key="duration",
            )
            datasets["valid"] = datasets["valid"].filtered_sorted(
                key_min_value={"duration": 5}, key_max_value={"duration": 5},
            )

        hparams["train_dataloader_opts"]["shuffle"] = True
    else:
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )

    return datasets


def read_vocab(
    vocab_path: str = "../../Tokenizer/save/1000_bpe.vocab",
) -> List[str]:
    """Read the token column of a tab-separated vocabulary file.

    Arguments
    ---------
    vocab_path : str
        Path of the vocabulary file, one entry per line. Defaults to the
        location this recipe has always read from.

    Returns
    -------
    List[str]
        The text before the first tab of every line, in file order, so that
        the list index matches the line number.
    """
    # assumes a SentencePiece-style "<piece>\t<score>" layout -- TODO confirm
    with open(vocab_path, encoding="utf-8") as vocab_file:
        # keep the whole token (the previous line[0] kept only its first
        # character); an empty line yields "" so the indices stay aligned
        return [line.rstrip("\n").split("\t")[0] for line in vocab_file]


if __name__ == "__main__":
    # Reading command line arguments
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Initialize ddp (useful only for multi-GPU DDP training)
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load hyperparameters file with command-line overrides
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create experiment directory
    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # transcription/translation tokenizer
    run_on_main(hparams["pretrainer"].collect_files)
    hparams["pretrainer"].load_collected(device=run_opts["device"])

    # We can now directly create the datasets for training, valid, and test
    datasets = dataio_prepare(hparams)

    st_brain = ST(
        modules=hparams["modules"],
        opt_class=hparams["Adam"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    st_brain.fit(
        st_brain.hparams.epoch_counter,
        datasets["train-sp"],
        datasets["valid"],
        train_loader_kwargs=hparams["train_dataloader_opts"],
        valid_loader_kwargs=hparams["valid_dataloader_opts"],
    )

    for dataset in ["dev", "dev2", "test"]:
        st_brain.evaluate(
            datasets[dataset],
            test_loader_kwargs=hparams["test_dataloader_opts"],
        )

# ---------------------------------------------------------------------------
# NOTE: in the flattened patch this script shares its last source line with
# the opening of the next file (recipes/MATBN/ASR/README.md). That text is
# kept verbatim below, as comments, so that no prose is lost:
#
#   # MATBN ASR with Transformers.
#   This folder contains recipes for tokenization and speech recognition with MATBN.
#
#   ### How to run
#   1. Train a tokenizer.
# ---------------------------------------------------------------------------
The tokenizer takes the training transcripts as input and determines the subword units that will be used for both acoustic and language model training. + + ``` + cd ../Tokenizer + python train.py hparams/tokenizer_char5k.yaml --dataset_folder=/path/to/dataset + ``` +2. Train a language model. (select one of RNNLM and TransformerLM) + + ``` + cd ../LM + python train.py hparams/RNNLM.yaml --tokenizer_file=../Tokenizer/results/tokenizer_char5k/5000_char.model --data_folder=../Tokenizer/results/prepare + ``` + or + ``` + cd ../LM + python train.py hparams/TransformerLM.yaml --tokenizer_file=../Tokenizer/results/tokenizer_char5k/5000_char.model --data_folder=../Tokenizer/results/prepare + ``` + +3. Train the speech recognizer. (select one of RNNLM and TransformerLM) + + ``` + python train.py hparams/transformer_RNNLM.yaml --data_folder=../Tokenizer/results/prepare --tokenizer_file=../Tokenizer/results/tokenizer_char5k/5000_char.model --lm_file=/path/to/lm.ckpt + ``` + or + ``` + python train.py hparams/transformer_TransformerLM.yaml --data_folder=../Tokenizer/results/prepare --tokenizer_file=../Tokenizer/results/tokenizer_char5k/5000_char.model --lm_file=/path/to/lm.ckpt + ``` + +# Performance summary +Results are reported in terms of Character Error Rate (CER). + +| hyperparams file | Test CER | GPUs | +|:--------------------------:| :-----:| :-----: | +| transformer_RNNLM.yaml | 8.41 | 1xGTX1080 8GB | +| transformer_TransformerLM.yaml | 8.25 | 1xGTX1080 8GB | + +# **About SpeechBrain** +- Website: https://speechbrain.github.io/ +- Code: https://github.com/speechbrain/speechbrain/ +- HuggingFace: https://huggingface.co/speechbrain/ + + +# **Citing SpeechBrain** +Please cite SpeechBrain if you use it for your research or business. 
+ +```bibtex +@misc{speechbrain, + title={{SpeechBrain}: A General-Purpose Speech Toolkit}, + author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio}, + year={2021}, + eprint={2106.04624}, + archivePrefix={arXiv}, + primaryClass={eess.AS}, + note={arXiv:2106.04624} +} +``` \ No newline at end of file diff --git a/recipes/MATBN/ASR/hparams/conformer.yaml b/recipes/MATBN/ASR/hparams/conformer.yaml new file mode 100644 index 0000000000..0686ce8c7c --- /dev/null +++ b/recipes/MATBN/ASR/hparams/conformer.yaml @@ -0,0 +1,219 @@ +output_folder: !ref results/asr_conformer +cer_file: !ref /cer.txt +train_log: !ref /train_log.txt +save_folder: !ref /save +ckpt_interval_minutes: 15 +num_workers: 8 + +data_folder: results/prepare +tokenizer_file: !PLACEHOLDER + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref /tokenizer + loadables: + tokenizer: !ref + paths: + tokenizer: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Feature parameters +sample_rate: 16000 +n_fft: 400 +n_mels: 80 +hop_length: 20 + +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + hop_length: !ref + +speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb + orig_freq: !ref + speeds: [90, 100, 110] + +# Training parameters +number_of_epochs: 40 +batch_size: 1 +ctc_weight: 0.3 +gradient_accumulation: 32 +gradient_clipping: 5.0 +loss_reduction: batchmean +sorting: random + +# stages related parameters +stage_one_epochs: 100 +lr_adam: 0.25 +lr_sgd: 0.000025 + +# 
Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + drop_last: True + shuffle: True + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + + +test_dataloader_opts: + batch_size: !ref + num_workers: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 256 +nhead: 4 +num_encoder_layers: 12 +num_decoder_layers: 6 +d_ffn: 2048 +transformer_dropout: 0.1 +activation: !name:torch.nn.GELU +output_neurons: 5000 +vocab_size: 5000 +attention_type: "RelPosMHAXL" +kernel_size: 15 +encoder_module: conformer + +# Outputs +blank_index: 0 +label_smoothing: 0.1 +pad_index: 0 +bos_index: 1 +eos_index: 2 +unk_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 # 1.0 +valid_search_interval: 10 +valid_beam_size: 10 +test_beam_size: 10 +ctc_weight_decode: 0.40 + +############################## models ################################ + +CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd + input_shape: (8, 10, 80) + num_blocks: 2 + num_layers_per_block: 1 + out_channels: (256, 256) + kernel_sizes: (3, 3) + strides: (2, 2) + residuals: (False, False) + +Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length + input_size: 5120 + tgt_vocab: !ref + d_model: !ref + nhead: !ref + num_encoder_layers: !ref + num_decoder_layers: !ref + d_ffn: !ref + dropout: !ref + activation: !ref + attention_type: !ref + kernel_size: !ref + encoder_module: !ref + normalize_before: True + max_length: 5000 + causal: False + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +seq_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +modules: + CNN: !ref + Transformer: !ref + seq_lin: !ref + ctc_lin: !ref + normalize: !ref + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +# define two optimizers here for two-stage training +Adam: 
!name:torch.optim.Adam + lr: 0 + betas: (0.9, 0.98) + eps: 0.000000001 + +SGD: !name:torch.optim.SGD + lr: !ref + momentum: 0.99 + nesterov: True + + +valid_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + using_eos_threshold: False + length_normalization: True + +test_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + using_eos_threshold: False + length_normalization: True + +log_softmax: !new:torch.nn.LogSoftmax + dim: -1 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + reduction: !ref + +seq_cost: !name:speechbrain.nnet.losses.kldiv_loss + label_smoothing: !ref + reduction: !ref + +noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: 25000 + model_size: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + noam_scheduler: !ref + normalizer: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + update_until_epoch: 4 + +remove_spaces: True +split_tokens: !apply:operator.not_ [!ref ] + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: !ref +acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats diff --git a/recipes/MATBN/ASR/hparams/transformer_RNNLM.yaml b/recipes/MATBN/ASR/hparams/transformer_RNNLM.yaml new file mode 100644 index 0000000000..1e89b07989 --- /dev/null +++ b/recipes/MATBN/ASR/hparams/transformer_RNNLM.yaml @@ -0,0 +1,239 @@ +output_folder: !ref 
results/asr_transformer_RNNLM +cer_file: !ref /cer.txt +train_log: !ref /train_log.txt +save_folder: !ref /save +ckpt_interval_minutes: 15 +num_workers: 4 + +data_folder: !PLACEHOLDER +tokenizer_file: !PLACEHOLDER +lm_file: !PLACEHOLDER + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref + loadables: + lm: !ref + tokenizer: !ref + paths: + tokenizer: !ref + lm: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Feature parameters +sample_rate: 16000 +n_fft: 400 +n_mels: 80 +hop_length: 20 + +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + hop_length: !ref + +speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb + orig_freq: !ref + speeds: [90, 100, 110] + +# Training parameters +number_of_epochs: 80 +batch_size: 1 +ctc_weight: 0.3 +gradient_accumulation: 32 +gradient_clipping: 5.0 +loss_reduction: batchmean +sorting: random + +# stages related parameters +stage_one_epochs: 70 +lr_adam: 1.0 +lr_sgd: 0.000025 + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + shuffle: True + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + +test_dataloader_opts: + num_workers: !ref + batch_size: 2 + +####################### Model parameters ########################### +# Transformer +d_model: 256 +nhead: 4 +num_encoder_layers: 12 +num_decoder_layers: 6 +d_ffn: 2048 +transformer_dropout: 0.1 +activation: !name:torch.nn.GELU +output_neurons: 5000 +vocab_size: 5000 + +# Outputs +blank_index: 0 +label_smoothing: 0.1 +pad_index: 0 +bos_index: 1 +eos_index: 2 +unk_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 # 1.0 +valid_search_interval: 10 +valid_beam_size: 10 +test_beam_size: 10 +ctc_weight_decode: 0.3 +lm_weight: 0.2 + + +############################## models 
################################ + +CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd + input_shape: (8, 10, 80) + num_blocks: 2 + num_layers_per_block: 1 + out_channels: (256, 256) + kernel_sizes: (3, 3) + strides: (2, 2) + residuals: (False, False) + +Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length + input_size: 5120 + tgt_vocab: !ref + d_model: !ref + nhead: !ref + num_encoder_layers: !ref + num_decoder_layers: !ref + d_ffn: !ref + dropout: !ref + activation: !ref + normalize_before: True + +lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: !ref + embedding_dim: 256 + activation: !name:torch.nn.LeakyReLU + dropout: 0.3 + rnn_layers: 2 + rnn_neurons: 512 + dnn_blocks: 1 + dnn_neurons: 256 + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +seq_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +modules: + CNN: !ref + Transformer: !ref + seq_lin: !ref + ctc_lin: !ref + normalize: !ref + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +# define two optimizers here for two-stage training +Adam: !name:torch.optim.Adam + lr: 0 + betas: (0.9, 0.98) + eps: 0.000000001 + +SGD: !name:torch.optim.SGD + lr: !ref + momentum: 0.99 + nesterov: True + + +valid_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + using_eos_threshold: False + length_normalization: True + +test_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + lm_weight: !ref + lm_modules: !ref + temperature: 1.15 + temperature_lm: 1.15 + 
using_eos_threshold: False + length_normalization: True + +log_softmax: !new:torch.nn.LogSoftmax + dim: -1 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + reduction: !ref + +seq_cost: !name:speechbrain.nnet.losses.kldiv_loss + label_smoothing: !ref + reduction: !ref + +noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: 6000 + model_size: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + noam_scheduler: !ref + normalizer: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + update_until_epoch: 4 + +augmentation: !new:speechbrain.lobes.augment.SpecAugment + time_warp: True + time_warp_window: 5 + time_warp_mode: bicubic + freq_mask: True + n_freq_mask: 2 + time_mask: True + n_time_mask: 2 + replace_with_zero: False + freq_mask_width: 30 + time_mask_width: 40 + +remove_spaces: True +split_tokens: !apply:operator.not_ [!ref ] + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: !ref +acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats diff --git a/recipes/MATBN/ASR/hparams/transformer_TransformerLM.yaml b/recipes/MATBN/ASR/hparams/transformer_TransformerLM.yaml new file mode 100644 index 0000000000..7938bb7d22 --- /dev/null +++ b/recipes/MATBN/ASR/hparams/transformer_TransformerLM.yaml @@ -0,0 +1,240 @@ +output_folder: !ref results/asr_transformer_TransformerLM +cer_file: !ref /cer.txt +train_log: !ref /train_log.txt +save_folder: !ref /save +ckpt_interval_minutes: 15 +num_workers: 4 + +data_folder: !PLACEHOLDER +tokenizer_file: !PLACEHOLDER +lm_file: !PLACEHOLDER + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref + loadables: + lm: !ref + tokenizer: 
!ref + paths: + tokenizer: !ref + lm: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Feature parameters +sample_rate: 16000 +n_fft: 400 +n_mels: 80 +hop_length: 20 + +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + hop_length: !ref + +speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb + orig_freq: !ref + speeds: [90, 100, 110] + +# Training parameters +number_of_epochs: 80 +batch_size: 1 +ctc_weight: 0.3 +gradient_accumulation: 32 +gradient_clipping: 5.0 +loss_reduction: batchmean +sorting: random + +# stages related parameters +stage_one_epochs: 70 +lr_adam: 1.0 +lr_sgd: 0.000025 + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + shuffle: True + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + +test_dataloader_opts: + batch_size: 2 + num_workers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 256 +nhead: 4 +num_encoder_layers: 12 +num_decoder_layers: 6 +d_ffn: 2048 +transformer_dropout: 0.1 +activation: !name:torch.nn.GELU +output_neurons: 5000 +vocab_size: 5000 + +# Outputs +blank_index: 0 +label_smoothing: 0.1 +pad_index: 0 +bos_index: 1 +eos_index: 2 +unk_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 # 1.0 +valid_search_interval: 10 +valid_beam_size: 10 +test_beam_size: 10 +ctc_weight_decode: 0.3 +lm_weight: 0.2 + +############################## models ################################ + +CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd + input_shape: (8, 10, 80) + num_blocks: 2 + num_layers_per_block: 1 + out_channels: (256, 256) + kernel_sizes: (3, 3) + strides: (2, 2) + residuals: (False, False) + +Transformer: + !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length + input_size: 5120 + tgt_vocab: !ref + 
d_model: !ref + nhead: !ref + num_encoder_layers: !ref + num_decoder_layers: !ref + d_ffn: !ref + dropout: !ref + activation: !ref + normalize_before: True + +lm_model: + !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length + vocab: !ref + d_model: 576 + nhead: 6 + num_encoder_layers: 6 + num_decoder_layers: 0 + d_ffn: 1538 + dropout: 0.2 + activation: !name:torch.nn.GELU + normalize_before: False + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +seq_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +modules: + CNN: !ref + Transformer: !ref + seq_lin: !ref + ctc_lin: !ref + normalize: !ref + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +# define two optimizers here for two-stage training +Adam: !name:torch.optim.Adam + lr: 0 + betas: (0.9, 0.98) + eps: 0.000000001 + +SGD: !name:torch.optim.SGD + lr: !ref + momentum: 0.99 + nesterov: True + +valid_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + using_eos_threshold: False + length_normalization: True + +test_search: !new:speechbrain.decoders.S2STransformerBeamSearch + modules: [!ref , !ref , !ref ] + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + ctc_weight: !ref + lm_weight: !ref + lm_modules: !ref + temperature: 1.15 + temperature_lm: 1.15 + using_eos_threshold: False + length_normalization: True + +log_softmax: !new:torch.nn.LogSoftmax + dim: -1 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + reduction: !ref + +seq_cost: !name:speechbrain.nnet.losses.kldiv_loss + label_smoothing: !ref + reduction: !ref + +noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: 
"""Train a joint CTC/attention Transformer ASR model on MATBN."""
import sys

import torch
import speechbrain as sb
from speechbrain.utils.distributed import run_on_main
from hyperpyyaml import load_hyperpyyaml


class ASR(sb.core.Brain):
    """Brain subclass combining a CTC loss and a seq2seq (KL-div) loss.

    Training starts with the optimizer given as ``opt_class`` and is switched
    to ``hparams.SGD`` once the epoch counter exceeds
    ``hparams.stage_one_epochs``.
    """

    def _decoding_enabled(self, stage):
        """Return True when beam search and CER scoring run for ``stage``.

        TEST always decodes; VALID decodes only on epochs that are a multiple
        of ``hparams.valid_search_interval``; TRAIN never decodes. Keeping the
        rule in one place guarantees that ``compute_forward`` produces
        hypotheses exactly when ``compute_objectives`` and ``on_stage_end``
        expect them.
        """
        if stage == sb.Stage.TEST:
            return True
        if stage == sb.Stage.VALID:
            interval = self.hparams.valid_search_interval
            return self.hparams.epoch_counter.current % interval == 0
        return False

    def compute_forward(self, batch, stage):
        """Run features -> CNN -> Transformer and, when enabled, beam search.

        Returns
        -------
        tuple
            ``(p_ctc, p_seq, wavs_len, hyps)`` where ``p_ctc`` / ``p_seq`` are
            the log-softmax outputs of the CTC and seq2seq heads and ``hyps``
            is ``None`` whenever decoding is skipped for this stage/epoch.
        """
        batch = batch.to(self.device)
        wavs, wavs_len = batch.sig
        tokens_bos, _ = batch.tokens_bos

        feats = self.hparams.compute_features(wavs)
        current_epoch = self.hparams.epoch_counter.current
        feats = self.modules.normalize(feats, wavs_len, epoch=current_epoch)

        src = self.modules.CNN(feats)
        enc_out, pred = self.modules.Transformer(
            src, tokens_bos, wavs_len, pad_idx=self.hparams.pad_index
        )

        # CTC head works on the encoder output.
        p_ctc = self.hparams.log_softmax(self.modules.ctc_lin(enc_out))

        # Seq2seq head works on the decoder output.
        p_seq = self.hparams.log_softmax(self.modules.seq_lin(pred))

        hyps = None
        if self._decoding_enabled(stage):
            searcher = (
                self.hparams.test_search
                if stage == sb.Stage.TEST
                else self.hparams.valid_search
            )
            hyps, _ = searcher(enc_out.detach(), wavs_len)

        return p_ctc, p_seq, wavs_len, hyps

    def compute_objectives(self, predictions, batch, stage):
        """Return the weighted CTC + attention loss and update the metrics.

        Outside of TRAIN the accuracy metric is always updated; the CER metric
        is updated only when hypotheses were decoded for this stage/epoch.
        """
        p_ctc, p_seq, wavs_len, hyps = predictions

        ids = batch.id
        tokens_eos, tokens_eos_len = batch.tokens_eos
        tokens, tokens_len = batch.tokens

        attention_loss = self.hparams.seq_cost(
            p_seq, tokens_eos, length=tokens_eos_len
        )
        ctc_loss = self.hparams.ctc_cost(p_ctc, tokens, wavs_len, tokens_len)
        ctc_weight = self.hparams.ctc_weight
        loss = ctc_weight * ctc_loss + (1 - ctc_weight) * attention_loss

        if stage != sb.Stage.TRAIN:
            if self._decoding_enabled(stage):
                # Use the tokenizer carried by this Brain instead of a
                # module-level global, so the class also works when imported.
                tokenizer = self.hparams.tokenizer
                hyp_words = [
                    tokenizer.decode_ids(utt_seq).split(" ") for utt_seq in hyps
                ]
                ref_words = [
                    transcription.split(" ")
                    for transcription in batch.transcription
                ]
                if self.hparams.remove_spaces:
                    hyp_words = ["".join(words) for words in hyp_words]
                    ref_words = ["".join(words) for words in ref_words]
                self.cer_metric.append(ids, hyp_words, ref_words)

            self.acc_metric.append(p_seq, tokens_eos, tokens_eos_len)

        return loss

    def fit_batch(self, batch):
        """Train on one batch with gradient accumulation; return the loss."""
        self.check_and_reset_optimizer()

        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)

        # Scale so that the accumulated gradient matches one large batch.
        (loss / self.hparams.gradient_accumulation).backward()

        if self.step % self.hparams.gradient_accumulation == 0:
            self.check_gradients(loss)

            self.optimizer.step()
            self.optimizer.zero_grad()

            # The Noam schedule advances once per optimizer update.
            self.hparams.noam_annealing(self.optimizer)

        return loss.detach()

    def evaluate_batch(self, batch, stage):
        """Compute the loss for one validation/test batch without gradients."""
        with torch.no_grad():
            predictions = self.compute_forward(batch, stage=stage)
            loss = self.compute_objectives(predictions, batch, stage=stage)
        # The loss is returned detached but is not moved to the CPU here.
        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Create fresh accuracy and CER trackers for VALID/TEST stages."""
        if stage != sb.Stage.TRAIN:
            self.acc_metric = self.hparams.acc_computer()
            self.cer_metric = self.hparams.cer_computer()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Summarize metrics, log them and save checkpoints."""
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["ACC"] = self.acc_metric.summarize()
            if self._decoding_enabled(stage):
                stage_stats["CER"] = self.cer_metric.summarize("error_rate")

        if stage == sb.Stage.VALID and sb.utils.distributed.if_main_process():
            current_epoch = self.hparams.epoch_counter.current
            if current_epoch <= self.hparams.stage_one_epochs:
                lr = self.hparams.noam_annealing.current_lr
                steps = self.hparams.noam_annealing.n_steps
            else:
                # Stage two runs SGD with a fixed learning rate.
                lr = self.hparams.lr_sgd
                steps = -1

            epoch_stats = {
                "epoch": epoch,
                "lr": lr,
                "steps": steps,
                "optimizer": self.optimizer.__class__.__name__,
            }
            self.hparams.train_logger.log_stats(
                stats_meta=epoch_stats,
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ACC": stage_stats["ACC"], "epoch": epoch},
                max_keys=["ACC"],
                num_to_keep=10,
            )

        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            # The report contains Chinese text, so the encoding is explicit.
            with open(self.hparams.cer_file, "w", encoding="utf-8") as cer_file:
                self.cer_metric.write_stats(cer_file)

            # ACC=1.1 is higher than any value summarized during validation is
            # expected to be, so with num_to_keep=1 only this checkpoint
            # survives.
            self.checkpointer.save_and_keep_only(
                meta={"ACC": 1.1, "epoch": epoch},
                max_keys=["ACC"],
                num_to_keep=1,
            )

    def check_and_reset_optimizer(self):
        """Switch to SGD once, after ``stage_one_epochs`` epochs have passed."""
        current_epoch = self.hparams.epoch_counter.current
        if not hasattr(self, "switched"):
            # First call: an SGD optimizer means the switch already happened
            # (for example in on_fit_start when resuming).
            self.switched = isinstance(self.optimizer, torch.optim.SGD)

        if self.switched is True:
            return

        if current_epoch > self.hparams.stage_one_epochs:
            self.optimizer = self.hparams.SGD(self.modules.parameters())

            if self.checkpointer is not None:
                self.checkpointer.add_recoverable("optimizer", self.optimizer)

            self.switched = True

    def on_fit_start(self):
        """Initialize the right optimizer on the training start."""
        super().on_fit_start()

        current_epoch = self.hparams.epoch_counter.current
        previous_optimizer = self.optimizer
        if current_epoch > self.hparams.stage_one_epochs:
            # Resuming inside stage two: rebuild the optimizer as SGD.
            del self.optimizer
            self.optimizer = self.hparams.SGD(self.modules.parameters())

            if self.checkpointer is not None:
                # Recover only if the optimizer that was loaded has a
                # "momentum" entry in its first parameter group.
                group = previous_optimizer.param_groups[0]
                if "momentum" not in group:
                    return
                self.checkpointer.recover_if_possible(
                    device=torch.device(self.device)
                )

    def on_evaluate_start(self, max_key=None, min_key=None):
        """Load the average of the selected checkpoints into the model."""
        super().on_evaluate_start()

        checkpoints = self.checkpointer.find_checkpoints(
            max_key=max_key, min_key=min_key
        )
        averaged_state = sb.utils.checkpoints.average_checkpoints(
            checkpoints, recoverable_name="model", device=self.device
        )

        self.hparams.model.load_state_dict(averaged_state, strict=True)
        self.hparams.model.eval()


def dataio_prepare(hparams):
    """Build the train/dev/test datasets described by the JSON manifests.

    The training set reads audio through the speed-perturbation pipeline; the
    dev and test sets read the audio unchanged.
    """

    @sb.utils.data_pipeline.takes("transcription")
    @sb.utils.data_pipeline.provides(
        "transcription", "tokens_bos", "tokens_eos", "tokens"
    )
    def transcription_pipline(transcription):
        yield transcription
        tokens_list = hparams["tokenizer"].encode_as_ids(transcription)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipline(wav):
        sig = sb.dataio.dataio.read_audio(wav)
        return sig

    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def sp_audio_pipline(wav):
        sig = sb.dataio.dataio.read_audio(wav)
        # The perturbation is applied with a leading dimension added and then
        # removed again.
        sig = sig.unsqueeze(0)
        sig = hparams["speed_perturb"](sig)
        sig = sig.squeeze(0)
        return sig

    data_folder = hparams["data_folder"]
    output_keys = [
        "transcription",
        "tokens_bos",
        "tokens_eos",
        "tokens",
        "sig",
        "id",
    ]
    pipelines_by_split = {
        "train": [transcription_pipline, sp_audio_pipline],
        "dev": [transcription_pipline, audio_pipline],
        "test": [transcription_pipline, audio_pipline],
    }

    datasets = {}
    for dataset_name, dynamic_items in pipelines_by_split.items():
        json_path = f"{data_folder}/{dataset_name}.json"
        datasets[dataset_name] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=json_path,
            replacements={"data_root": data_folder},
            dynamic_items=dynamic_items,
            output_keys=output_keys,
        )

    return datasets


if __name__ == "__main__":
    hparams_file_path, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    sb.utils.distributed.ddp_init_group(run_opts)

    with open(hparams_file_path) as hparams_file:
        hparams = load_hyperpyyaml(hparams_file, overrides)

    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file_path,
        overrides=overrides,
    )

    # Fetch the pretrained tokenizer / LM files once, then load them everywhere.
    run_on_main(hparams["pretrainer"].collect_files)
    hparams["pretrainer"].load_collected(device=run_opts["device"])

    datasets = dataio_prepare(hparams)

    asr_brain = ASR(
        modules=hparams["modules"],
        opt_class=hparams["Adam"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    asr_brain.fit(
        asr_brain.hparams.epoch_counter,
        datasets["train"],
        datasets["dev"],
        train_loader_kwargs=hparams["train_dataloader_opts"],
        valid_loader_kwargs=hparams["valid_dataloader_opts"],
    )

    # NOTE: test-set evaluation is intentionally left disabled, as in the
    # original script:
    # asr_brain.evaluate(
    #     datasets["test"],max_key="ACC", test_loader_kwargs=hparams["test_dataloader_opts"]
    # )
+ +```bibtex +@misc{speechbrain, + title={{SpeechBrain}: A General-Purpose Speech Toolkit}, + author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio}, + year={2021}, + eprint={2106.04624}, + archivePrefix={arXiv}, + primaryClass={eess.AS}, + note={arXiv:2106.04624} +} +``` \ No newline at end of file diff --git a/recipes/MATBN/LM/hparams/RNNLM.yaml b/recipes/MATBN/LM/hparams/RNNLM.yaml new file mode 100644 index 0000000000..202002b20f --- /dev/null +++ b/recipes/MATBN/LM/hparams/RNNLM.yaml @@ -0,0 +1,92 @@ +output_folder: !ref results/RNNLM +save_folder: !ref /save +train_log: !ref /train_log.txt + +data_folder: !PLACEHOLDER + +tokenizer_file: !PLACEHOLDER + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref /tokenizer + loadables: + tokenizer: !ref + paths: + tokenizer: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Training parameters +number_of_epochs: 50 +batch_size: 24 +lr: 0.001 +accumulation_steps: 6 +ckpt_interval_minutes: 15 + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + +# Model parameters +emb_size: 256 +activation: !name:torch.nn.LeakyReLU +dropout: 0.3 +rnn_layers: 2 +rnn_neurons: 512 +dnn_blocks: 1 +dnn_neurons: 256 + +# Outputs +output_neurons: 5000 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: !ref + embedding_dim: 
!ref + activation: !ref + dropout: !ref + rnn_layers: !ref + rnn_neurons: !ref + dnn_blocks: !ref + dnn_neurons: !ref + +modules: + model: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +optimizer: !name:torch.optim.Adam + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + + +compute_cost: !name:speechbrain.nnet.losses.nll_loss diff --git a/recipes/MATBN/LM/hparams/RNNLM_cna.yaml b/recipes/MATBN/LM/hparams/RNNLM_cna.yaml new file mode 100644 index 0000000000..5e6e329de0 --- /dev/null +++ b/recipes/MATBN/LM/hparams/RNNLM_cna.yaml @@ -0,0 +1,95 @@ +output_folder: !ref results/RNNLM_cna +save_folder: !ref /save +train_log: !ref /train_log.txt +num_workers: 4 + +data_folder: results/prepare_cna + +tokenizer_file: results/tokenizer_time_rnnlm/8000_char.model + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref /tokenizer + loadables: + tokenizer: !ref + paths: + tokenizer: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Training parameters +number_of_epochs: 50 +batch_size: 128 +lr: 0.001 +accumulation_steps: 1 +ckpt_interval_minutes: 15 + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + num_workers: !ref + shuffle: True + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + +test_dataloader_opts: + batch_size: !ref + num_workers: !ref + +# Model parameters +emb_size: 128 +activation: !name:torch.nn.LeakyReLU +dropout: 0.2 +rnn_layers: 2 +rnn_neurons: 1024 +dnn_blocks: 1 +dnn_neurons: 
256 + +# Outputs +output_neurons: 8000 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: !ref + embedding_dim: !ref + activation: !ref + dropout: !ref + rnn_layers: !ref + rnn_neurons: !ref + dnn_blocks: !ref + dnn_neurons: !ref + +modules: + model: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +optimizer: !name:torch.optim.Adam + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + +compute_cost: !name:speechbrain.nnet.losses.nll_loss diff --git a/recipes/MATBN/LM/hparams/TransformerLM.yaml b/recipes/MATBN/LM/hparams/TransformerLM.yaml new file mode 100644 index 0000000000..bc6043f7ab --- /dev/null +++ b/recipes/MATBN/LM/hparams/TransformerLM.yaml @@ -0,0 +1,89 @@ +output_folder: !ref results/TransformerLM +save_folder: !ref /save +train_log: !ref /train_log.txt + +data_folder: results/prepare + +tokenizer_file: !PLACEHOLDER + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref /tokenizer + loadables: + tokenizer: !ref + paths: + tokenizer: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Training parameters +number_of_epochs: 20 +batch_size: 2 +lr: 1 +accumulation_steps: 64 +ckpt_interval_minutes: 15 + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + pin_memory: True + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + +# Model parameters +d_model: 576 + +# Outputs 
"""Train an RNN or Transformer language model on MATBN transcriptions."""
import sys

import torch
import speechbrain as sb
from speechbrain.dataio import dataset
from speechbrain.utils.distributed import run_on_main
from hyperpyyaml import load_hyperpyyaml


class LM(sb.core.Brain):
    """Brain subclass that trains a next-token language model."""

    def _uses_per_step_scheduler(self):
        """Return True if ``lr_annealing`` is stepped after optimizer updates.

        Noam and cyclic-cosine schedulers are called with the optimizer after
        each update; any other scheduler is called once per epoch with the
        validation loss. ``fit_batch`` and ``on_stage_end`` must agree on this,
        so the check lives in one place.
        """
        return isinstance(
            self.hparams.lr_annealing,
            (
                sb.nnet.schedulers.NoamScheduler,
                sb.nnet.schedulers.CyclicCosineScheduler,
            ),
        )

    def compute_forward(self, batch, stage):
        """Return log-probabilities predicted from the BOS-prefixed tokens."""
        batch = batch.to(self.device)
        tokens_bos, _ = batch.tokens_bos
        logits = self.hparams.model(tokens_bos)
        return self.hparams.log_softmax(logits)

    def compute_objectives(self, predictions, batch, stage):
        """Return the cost of ``predictions`` against the EOS-suffixed tokens."""
        batch = batch.to(self.device)
        tokens_eos, tokens_len = batch.tokens_eos
        return self.hparams.compute_cost(
            predictions, tokens_eos, length=tokens_len
        )

    def fit_batch(self, batch):
        """Train on one batch with gradient accumulation; return the loss."""
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)

        # Scale so that the accumulated gradient matches one large batch.
        (loss / self.hparams.accumulation_steps).backward()

        if self.step % self.hparams.accumulation_steps == 0:
            self.check_gradients(loss)

            self.optimizer.step()
            self.optimizer.zero_grad()

            if self._uses_per_step_scheduler():
                self.hparams.lr_annealing(self.optimizer)

        # Detach so the caller does not keep the autograd graph alive.
        return loss.detach()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Anneal the learning rate, log statistics and save checkpoints."""
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats

        on_main = sb.utils.distributed.if_main_process()

        if stage == sb.Stage.VALID and on_main:
            if self._uses_per_step_scheduler():
                old_lr = self.hparams.lr_annealing.current_lr
            else:
                # Epoch-level scheduler: driven by the validation loss.
                old_lr, new_lr = self.hparams.lr_annealing(stage_loss)
                sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta=stage_stats, min_keys=["loss"],
            )

        if stage == sb.Stage.TEST and on_main:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )


def dataio_prepare(hparams):
    """Build the train/valid/test datasets from ``<data_folder>/<split>.json``."""

    @sb.utils.data_pipeline.takes("transcription")
    @sb.utils.data_pipeline.provides(
        "transcription", "tokens_bos", "tokens_eos"
    )
    def transcription_pipline(transcription):
        yield transcription
        tokens_list = hparams["tokenizer"].encode_as_ids(transcription)
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos

    data_folder = hparams["data_folder"]
    return {
        dataset_name: dataset.DynamicItemDataset.from_json(
            json_path=f"{data_folder}/{dataset_name}.json",
            replacements={"data_root": data_folder},
            dynamic_items=[transcription_pipline],
            output_keys=["transcription", "tokens_bos", "tokens_eos"],
        )
        for dataset_name in ["train", "valid", "test"]
    }


if __name__ == "__main__":
    hparams_file_path, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
    with open(hparams_file_path) as hparams_file:
        hparams = load_hyperpyyaml(hparams_file, overrides)

    sb.utils.distributed.ddp_init_group(run_opts)

    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file_path,
        overrides=overrides,
    )

    # Fetch the pretrained tokenizer once, then load it on every process.
    run_on_main(hparams["pretrainer"].collect_files)
    hparams["pretrainer"].load_collected(device=run_opts["device"])

    datasets = dataio_prepare(hparams)

    lm_brain = LM(
        modules=hparams["modules"],
        opt_class=hparams["optimizer"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    lm_brain.fit(
        lm_brain.hparams.epoch_counter,
        datasets["train"],
        datasets["valid"],
        train_loader_kwargs=hparams["train_dataloader_opts"],
        valid_loader_kwargs=hparams["valid_dataloader_opts"],
    )

    # Evaluate with the checkpoint that reached the lowest validation loss.
    lm_brain.evaluate(
        datasets["test"],
        min_key="loss",
        test_loader_kwargs=hparams["test_dataloader_opts"],
    )
The tokenizer is trained on top of the MATBN training transcriptions. + +# How to run +``` +python train.py hparams/tokenizer_char5k.yaml --dataset_folder= +``` + +# About SpeechBrain +- Website: https://speechbrain.github.io/ +- Code: https://github.com/speechbrain/speechbrain/ +- HuggingFace: https://huggingface.co/speechbrain/ + + +# Citing SpeechBrain +Please cite SpeechBrain if you use it for your research or business. + +```bibtex +@misc{speechbrain, + title={{SpeechBrain}: A General-Purpose Speech Toolkit}, + author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio}, + year={2021}, + eprint={2106.04624}, + archivePrefix={arXiv}, + primaryClass={eess.AS}, + note={arXiv:2106.04624} +} +``` \ No newline at end of file diff --git a/recipes/MATBN/Tokenizer/cna_prepare.py b/recipes/MATBN/Tokenizer/cna_prepare.py new file mode 120000 index 0000000000..c9865d979e --- /dev/null +++ b/recipes/MATBN/Tokenizer/cna_prepare.py @@ -0,0 +1 @@ +../cna_prepare.py \ No newline at end of file diff --git a/recipes/MATBN/Tokenizer/hparams/tokenizer_char5k.yaml b/recipes/MATBN/Tokenizer/hparams/tokenizer_char5k.yaml new file mode 100644 index 0000000000..34beee6e42 --- /dev/null +++ b/recipes/MATBN/Tokenizer/hparams/tokenizer_char5k.yaml @@ -0,0 +1,27 @@ +dataset_folder: !PLACEHOLDER +prepare_folder: results/prepare +output_folder: results/tokenizer_char5k +keep_unk: False + +token_type: char # ["unigram", "bpe", "char"] +token_output: 5000 +character_coverage: 1.0 +annotation_read: transcription + +train_json: !ref /train.json +dev_json: !ref /dev.json +eval_json: !ref /eval.json +test_json: !ref /test.json + + +tokenizer: 
!name:speechbrain.tokenizers.SentencePiece.SentencePiece + model_dir: !ref + vocab_size: !ref + annotation_train: !ref + annotation_read: !ref + model_type: !ref # ["unigram", "bpe", "char"] + character_coverage: !ref + annotation_list_to_check: [!ref , !ref , !ref ] # yamllint disable-line rule:line-length + annotation_format: json + bos_id: 1 + eos_id: 2 diff --git a/recipes/MATBN/Tokenizer/hparams/tokenizer_time_rnnlm.yaml b/recipes/MATBN/Tokenizer/hparams/tokenizer_time_rnnlm.yaml new file mode 100644 index 0000000000..643104637f --- /dev/null +++ b/recipes/MATBN/Tokenizer/hparams/tokenizer_time_rnnlm.yaml @@ -0,0 +1,40 @@ +dataset_folder: !PLACEHOLDER +prepare_folder: results/prepare_matbn_10 +output_folder: results/tokenizer_time_rnnlm +keep_unk: False +skip_prepare: False + +cna: True +cna_dataset_folder: !PLACEHOLDER +cna_prepare_folder: results/prepare_cna +cna_settings_json_path: !ref /settings.json +cna_before_2000: False +cna_skip_prepare: False + +token_type: char # ["unigram", "bpe", "char"] +token_output: 8000 # index(blank/eos/bos/unk) = 0 +character_coverage: 1.0 +annotation_read: transcription + +train_json: !ref /train.json +dev_json: !ref /dev.json +eval_json: !ref /eval.json +test_json: !ref /test.json + +cna_train_json: !ref /train.json +cna_valid_json: !ref /valid.json +cna_test_json: !ref /test.json +all_train_json: !PLACEHOLDER + +tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece + model_dir: !ref + vocab_size: !ref + annotation_train: !ref + annotation_read: !ref + model_type: !ref # ["unigram", "bpe", "char"] + character_coverage: !ref + + annotation_list_to_check: [!ref , !ref , !ref , !ref ] + annotation_format: json + bos_id: 1 + eos_id: 2 diff --git a/recipes/MATBN/Tokenizer/matbn_prepare.py b/recipes/MATBN/Tokenizer/matbn_prepare.py new file mode 120000 index 0000000000..96bb29fb8d --- /dev/null +++ b/recipes/MATBN/Tokenizer/matbn_prepare.py @@ -0,0 +1 @@ +../matbn_prepare.py \ No newline at end of file diff --git 
if __name__ == "__main__":
    # Tokenizer training entry point: prepare the MATBN annotations
    # (optionally merged with CNA text), then call the tokenizer object
    # declared in the hyperparameter YAML.
    hparams_file_path, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    with open(hparams_file_path) as hparams_file:
        hparams = load_hyperpyyaml(hparams_file, overrides)

    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file_path,
        overrides=overrides,
    )

    run_on_main(
        prepare_matbn,
        kwargs={
            "dataset_folder": hparams["dataset_folder"],
            "save_folder": hparams["prepare_folder"],
            "keep_unk": hparams["keep_unk"],
            # Optional key: hparams/tokenizer_char5k.yaml does not define
            # it, so fall back to "do not skip" instead of raising KeyError.
            "skip_prep": hparams.get("skip_prepare", False),
        },
    )

    # "cna" is optional as well (absent from tokenizer_char5k.yaml).
    if hparams.get("cna", False):
        run_on_main(
            prepare_cna,
            kwargs={
                "dataset_folder": hparams["cna_dataset_folder"],
                "save_folder": hparams["cna_prepare_folder"],
                "settings_json_path": hparams["cna_settings_json_path"],
                "before_2000": hparams["cna_before_2000"],
                "skip_prep": hparams["cna_skip_prepare"],
            },
        )

        # NOTE(review): the merge reads CNA-only keys, so it is kept inside
        # this branch -- confirm this matches the intended flow.
        with open(
            hparams["cna_train_json"], encoding="utf-8"
        ) as cna_train_file, open(
            hparams["train_json"], encoding="utf-8"
        ) as train_json:
            cna_train_data = json.load(cna_train_file)
            train_data = json.load(train_json)

        # On a key collision the CNA entry replaces the MATBN entry.
        train_data.update(cna_train_data)

        # Write through a context manager so the merged file is flushed and
        # closed before the tokenizer is trained.
        with open(
            hparams["all_train_json"], "w", encoding="utf-8"
        ) as all_train_file:
            json.dump(
                train_data, all_train_file, indent=2, ensure_ascii=False,
            )

    hparams["tokenizer"]()
logger = logging.getLogger(__name__)


@dataclass
class Data:
    """One CNA text entry as stored in the prepared JSON annotation."""

    date: str  # name (without extension) of the text file the line came from
    transcription: str  # a single line of text from that file


class DataClassJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes dataclass instances as dictionaries."""

    def default(self, object):
        """Return ``asdict(object)`` for dataclasses, else defer to the base."""
        if is_dataclass(object):
            return asdict(object)
        return super().default(object)


def prepare_cna(
    dataset_folder: str,
    save_folder: str,
    settings_json_path: str,
    before_2000: bool,
    skip_prep: bool = False,
):
    """Build ``valid.json``, ``test.json`` and ``train.json`` from CNA text.

    The settings JSON maps each split name to a list of text file names
    (without the ``.txt`` extension) located in ``dataset_folder``. Every
    line of 1 to 128 characters becomes one ``Data`` entry, keyed by its
    running index within the split.

    Arguments
    ---------
    dataset_folder : str
        Folder containing the ``<name>.txt`` files.
    save_folder : str
        Folder where the JSON files are written (created if missing).
    settings_json_path : str
        Path of the JSON file listing the text files of each split.
        Splits missing from the file are treated as empty.
    before_2000 : bool
        If False, files whose name does not start with "20" are ignored.
    skip_prep : bool
        If True, return immediately without doing anything.
    """
    if skip_prep:
        return

    os.makedirs(save_folder, exist_ok=True)

    if not check_folders_exist(dataset_folder):
        # Only logged: preparation still succeeds when no file is selected.
        logger.error("the dataset folder does not exist: %s", dataset_folder)

    # Start from empty splits so a settings file that omits one of them
    # yields an empty annotation instead of a KeyError.
    settings = {"train": [], "test": [], "valid": []}
    with open(settings_json_path, "r", encoding="utf-8") as settings_file:
        settings.update(json.load(settings_file))

    for split_name in ["valid", "test", "train"]:
        data = {}
        for text_file_name in settings[split_name]:
            if not before_2000 and not text_file_name.startswith("20"):
                continue

            text_file_path = os.path.join(
                dataset_folder, f"{text_file_name}.txt"
            )
            with open(text_file_path, "r", encoding="utf-8") as text_file:
                lines = text_file.read().splitlines()

            for line in lines:
                # Drop empty lines and lines longer than 128 characters.
                if len(line) > 128 or len(line) < 1:
                    continue
                data[len(data)] = Data(
                    date=text_file_name, transcription=line
                )

        save_path = os.path.join(save_folder, f"{split_name}.json")
        with open(save_path, "w", encoding="utf-8") as save_file:
            json.dump(
                data,
                save_file,
                indent=2,
                ensure_ascii=False,
                cls=DataClassJSONEncoder,
            )


def check_folders_exist(*folders) -> bool:
    """Return True when every given path exists (True for no arguments)."""
    return all(os.path.exists(folder) for folder in folders)


if __name__ == "__main__":
    save_folder = "results/prepare_cna"
    dataset_folder = "PLACEHOLDER"
    settings_json_path = "settings.json"
    before_2000 = False
    prepare_cna(dataset_folder, save_folder, settings_json_path, before_2000)
logger = logging.getLogger(__name__)


@dataclass
class Data:
    """One MATBN utterance: its audio file path and its transcription."""

    wav: str
    transcription: str


class DataClassJSONEncoder(json.JSONEncoder):
    """JSON encoder that turns dataclass instances into dictionaries."""

    def default(self, object):
        """Serialize dataclasses via ``asdict``; defer everything else."""
        if not is_dataclass(object):
            return super().default(object)
        return asdict(object)


def prepare_matbn(
    dataset_folder: str,
    save_folder: str,
    keep_unk: bool = False,
    skip_prep: bool = False,
):
    """Write ``eval.json`` and ``train.json`` annotations for MATBN.

    For each split, ``<dataset_folder>/<split>/data/text`` is parsed with
    ``extract_data``, filtered with ``remove_useless_data`` and dumped to
    ``<save_folder>/<split>.json``.

    Arguments
    ---------
    dataset_folder : str
        Root folder holding the ``eval`` and ``train`` sub-folders.
    save_folder : str
        Output folder for the JSON files (created if missing).
    keep_unk : bool
        Forwarded to ``remove_useless_data``.
    skip_prep : bool
        If True, return immediately without doing anything.
    """
    if skip_prep:
        return

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    for split in ("eval", "train"):  # dev, test
        wav_folder = os.path.join(dataset_folder, split, "wav")
        data_folder = os.path.join(dataset_folder, split, "data")
        if not check_folders_exist(wav_folder, data_folder):
            # Only logged; processing of the split is still attempted.
            logger.error(
                "the folder wav or data does not exist (it is expected in the "
                "MATBN dataset)"
            )

        raw_entries = extract_data(os.path.join(data_folder, "text"), wav_folder)
        kept_entries = remove_useless_data(raw_entries, keep_unk)

        target_path = os.path.join(save_folder, f"{split}.json")
        with open(target_path, "w", encoding="utf-8") as target_file:
            json.dump(
                kept_entries,
                target_file,
                indent=2,
                ensure_ascii=False,
                cls=DataClassJSONEncoder,
            )


def check_folders_exist(*folders) -> bool:
    """Return True when every given path exists (True for no arguments)."""
    return all(os.path.exists(folder) for folder in folders)
class E2E(E2ETransformer):
    """Speech-translation model with an ESPnet Conformer encoder.

    The Transformer ``E2E`` parent is initialised first (which builds its
    own encoder and the decoder); the encoder is then replaced by a
    Conformer ``Encoder``. Everything else is inherited.
    """

    def __init__(
        self,
        idim: int,
        odim: int,
        adim: int,
        aheads: int,
        wshare: int,
        ldconv_encoder_kernel_length: int,
        ldconv_usebias: bool,
        eunits: int,
        elayers: int,
        transformer_input_layer: str,
        transformer_encoder_selfattn_layer_type: str,
        transformer_decoder_selfattn_layer_type: str,
        ldconv_decoder_kernel_length: int,
        dunits: int,
        dlayers: int,
        transformer_encoder_pos_enc_layer_type: str,
        transformer_encoder_activation_type: str,
        macaron_style: bool = True,
        use_cnn_module: bool = True,
        cnn_module_kernel: int = 15,
        dropout_rate: float = 0.1,
        transformer_attn_dropout_rate: float = 0,
        sos: int = 1,
        eos: int = 2,
        ignore_id: int = -1,
    ):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param int adim: attention dimension
        :param int aheads: number of attention heads
        :param int eunits: encoder ``linear_units``
        :param int elayers: number of encoder blocks
        :param str transformer_input_layer: encoder ``input_layer``
        :param str transformer_encoder_selfattn_layer_type:
            encoder ``selfattention_layer_type``
        :param str transformer_encoder_pos_enc_layer_type:
            encoder ``pos_enc_layer_type``
        :param str transformer_encoder_activation_type:
            encoder ``activation_type``
        :param bool macaron_style: forwarded to the Conformer encoder
        :param bool use_cnn_module: forwarded to the Conformer encoder
        :param int cnn_module_kernel: forwarded to the Conformer encoder
        :param float dropout_rate: dropout and positional dropout rate
        :param float transformer_attn_dropout_rate: attention dropout rate

        The remaining arguments are only passed on to the parent
        constructor.
        """
        # The parent builds the decoder (and a Transformer encoder that is
        # overwritten just below).
        super().__init__(
            idim=idim,
            odim=odim,
            adim=adim,
            aheads=aheads,
            wshare=wshare,
            ldconv_encoder_kernel_length=ldconv_encoder_kernel_length,
            ldconv_usebias=ldconv_usebias,
            eunits=eunits,
            elayers=elayers,
            transformer_input_layer=transformer_input_layer,
            transformer_encoder_selfattn_layer_type=(
                transformer_encoder_selfattn_layer_type
            ),
            transformer_decoder_selfattn_layer_type=(
                transformer_decoder_selfattn_layer_type
            ),
            ldconv_decoder_kernel_length=ldconv_decoder_kernel_length,
            dunits=dunits,
            dlayers=dlayers,
            dropout_rate=dropout_rate,
            transformer_attn_dropout_rate=transformer_attn_dropout_rate,
            sos=sos,
            eos=eos,
            ignore_id=ignore_id,
        )

        conformer_options = dict(
            idim=idim,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=eunits,
            num_blocks=elayers,
            input_layer=transformer_input_layer,
            dropout_rate=dropout_rate,
            positional_dropout_rate=dropout_rate,
            attention_dropout_rate=transformer_attn_dropout_rate,
            pos_enc_layer_type=transformer_encoder_pos_enc_layer_type,
            selfattention_layer_type=transformer_encoder_selfattn_layer_type,
            activation_type=transformer_encoder_activation_type,
            macaron_style=macaron_style,
            use_cnn_module=use_cnn_module,
            cnn_module_kernel=cnn_module_kernel,
        )
        self.encoder = Encoder(**conformer_options)
class E2E(STInterface, torch.nn.Module):
    """End-to-end speech translation model (Transformer encoder/decoder).

    Wraps an ESPnet ``Encoder`` and ``Decoder`` and exposes training
    (``forward``) and beam-search inference (``translate``) entry points.
    """

    def __init__(
        self,
        idim: int,
        odim: int,
        adim: int,
        aheads: int,
        wshare: int,
        ldconv_encoder_kernel_length: int,
        ldconv_usebias: bool,
        eunits: int,
        elayers: int,
        transformer_input_layer: str,
        transformer_encoder_selfattn_layer_type: str,
        transformer_decoder_selfattn_layer_type: str,
        ldconv_decoder_kernel_length: int,
        dunits: int,
        dlayers: int,
        dropout_rate: float = 0.1,
        transformer_attn_dropout_rate: float = 0,
        sos: int = 1,
        eos: int = 2,
        ignore_id: int = 0,
    ):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param int adim: attention dimension (encoder and decoder)
        :param int aheads: number of attention heads
        :param int wshare: ``conv_wshare`` of encoder and decoder
        :param int ldconv_encoder_kernel_length: encoder ``conv_kernel_length``
        :param bool ldconv_usebias: ``conv_usebias`` of encoder and decoder
        :param int eunits: encoder ``linear_units``
        :param int elayers: number of encoder blocks
        :param str transformer_input_layer: encoder ``input_layer``
        :param str transformer_encoder_selfattn_layer_type:
            encoder ``selfattention_layer_type``
        :param str transformer_decoder_selfattn_layer_type:
            decoder ``selfattention_layer_type``
        :param int ldconv_decoder_kernel_length: decoder ``conv_kernel_length``
        :param int dunits: decoder ``linear_units``
        :param int dlayers: number of decoder blocks
        :param float dropout_rate: dropout and positional dropout rate
        :param float transformer_attn_dropout_rate: attention dropout rate
        :param int sos: index of the start-of-sequence token
        :param int eos: index of the end-of-sequence token
        :param int ignore_id: index used for padded target positions
        """
        torch.nn.Module.__init__(self)

        self.encoder = Encoder(
            idim=idim,
            selfattention_layer_type=transformer_encoder_selfattn_layer_type,
            attention_dim=adim,
            attention_heads=aheads,
            conv_wshare=wshare,
            conv_kernel_length=ldconv_encoder_kernel_length,
            conv_usebias=ldconv_usebias,
            linear_units=eunits,
            num_blocks=elayers,
            input_layer=transformer_input_layer,
            dropout_rate=dropout_rate,
            positional_dropout_rate=dropout_rate,
            attention_dropout_rate=transformer_attn_dropout_rate,
        )

        self.decoder = Decoder(
            odim=odim,
            selfattention_layer_type=transformer_decoder_selfattn_layer_type,
            attention_dim=adim,
            attention_heads=aheads,
            conv_wshare=wshare,
            conv_kernel_length=ldconv_decoder_kernel_length,
            conv_usebias=ldconv_usebias,
            linear_units=dunits,
            num_blocks=dlayers,
            dropout_rate=dropout_rate,
            positional_dropout_rate=dropout_rate,
            self_attention_dropout_rate=transformer_attn_dropout_rate,
            src_attention_dropout_rate=transformer_attn_dropout_rate,
        )

        self.pad = 0  # use for padding
        self.sos = sos
        self.eos = eos
        self.odim = odim
        self.ignore_id = ignore_id
        # ``translate`` consults this flag; it is off unless a caller
        # enables it explicitly on the instance.
        self.replace_sos = False

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of source sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: encoder output, encoder mask, decoder output, decoder mask
        :rtype: tuple
        """
        # 1. forward encoder
        xs_pad = xs_pad[:, : max(ilens)]  # for data parallel
        src_mask = (
            make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2)
        )
        hs_pad, hs_mask = self.encoder(xs_pad, src_mask)

        # 2. forward decoder
        ys_in_pad, ys_out_pad = add_sos_eos(
            ys_pad, self.sos, self.eos, self.ignore_id
        )
        ys_mask = target_mask(ys_in_pad, self.ignore_id)
        pred_pad, pred_mask = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask)

        return hs_pad, hs_mask, pred_pad, pred_mask

    def scorers(self):
        """Scorers."""
        return dict(decoder=self.decoder)

    def encode(self, x):
        """Encode source acoustic features.

        Switches the module to evaluation mode.

        :param ndarray x: source acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()
        x = torch.as_tensor(x).unsqueeze(0)
        enc_output, _ = self.encoder(x, None)
        return enc_output.squeeze(0)

    def translate(  # noqa: C901
        self, x, trans_args, char_list=None,
    ):
        """Translate input speech with beam search.

        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
        :param Namespace trans_args: argument Namespace containing options
            (``beam_size``, ``penalty``, ``maxlenratio``, ``minlenratio``,
            ``nbest`` and optionally ``tgt_lang``)
        :param list char_list: list of characters
        :return: N-best decoding results
        :rtype: list
        """
        # prepare sos: default to <sos>, and only use the target-language
        # token when it is requested and sos replacement is enabled.
        y = self.sos
        tgt_lang = getattr(trans_args, "tgt_lang", False)
        if tgt_lang and self.replace_sos:
            y = char_list.index(tgt_lang)
        logging.info("<sos> index: " + str(y))
        if char_list is not None:
            logging.info("<sos> mark: " + char_list[y])
        logging.info("input lengths: " + str(x.shape[0]))

        enc_output = self.encode(x).unsqueeze(0)

        h = enc_output

        logging.info("encoder output lengths: " + str(h.size(1)))
        # search params
        beam = trans_args.beam_size
        penalty = trans_args.penalty

        if trans_args.maxlenratio == 0:
            maxlen = h.size(1)
        else:
            # maxlen >= 1
            maxlen = max(1, int(trans_args.maxlenratio * h.size(1)))
        minlen = int(trans_args.minlenratio * h.size(1))
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        hyp = {"score": 0.0, "yseq": [y]}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            logging.debug("position " + str(i))

            # batchify the current hypotheses
            ys = h.new_zeros((len(hyps), i + 1), dtype=torch.int64)
            for j, hyp in enumerate(hyps):
                ys[j, :] = torch.tensor(hyp["yseq"])
            ys_mask = subsequent_mask(i + 1).unsqueeze(0).to(h.device)

            local_scores = self.decoder.forward_one_step(
                ys, ys_mask, h.repeat([len(hyps), 1, 1])
            )[0]

            hyps_best_kept = []
            for j, hyp in enumerate(hyps):
                local_best_scores, local_best_ids = torch.topk(
                    local_scores[j : j + 1], beam, dim=1
                )

                # Separate index name: the original reused ``j`` here and
                # shadowed the hypothesis index of the enclosing loop.
                for k in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(
                        local_best_scores[0, k]
                    )
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(
                        local_best_ids[0, k]
                    )
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypotheses: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and trans_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remained hypotheses: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: "
                        + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )

            logging.debug("number of ended hypotheses: " + str(len(ended_hyps)))

        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), trans_args.nbest)
        ]

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform translation "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            trans_args = Namespace(**vars(trans_args))
            trans_args.minlenratio = max(0.0, trans_args.minlenratio - 0.1)
            return self.translate(x, trans_args, char_list)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad, ys_pad_src):
        """E2E attention calculation.

        Runs ``forward`` in evaluation mode without gradients, collects the
        attention weights, then switches the module back to training mode.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src:
            batch of padded token id sequence tensor (B, Lmax); unused,
            kept so existing callers keep working
        :return: attention weights (B, H, Lmax, Tmax), keyed by module name
        :rtype: dict
        """
        self.eval()
        with torch.no_grad():
            # ``forward`` takes three tensors; passing ``ys_pad_src`` as a
            # fourth argument raised a TypeError.
            self.forward(xs_pad, ilens, ys_pad)
        ret = dict()
        for name, m in self.named_modules():
            if (
                isinstance(m, MultiHeadedAttention) and m.attn is not None
            ):  # skip MHA for submodules
                ret[name] = m.attn.cpu().numpy()
        self.train()
        return ret
class RecognizeSystem(Enum):
    """
    The Systems of BUT
    ref: https://speech.fit.vutbr.cz/software/phoneme-recognizer-based-long-temporal-context

    PHN_CZ_SPDAT_LCRC_N1500 - 8kHz, 2 block STC, trained on Czech SpeechDat-E
    PHN_HU_SPDAT_LCRC_N1500 - 8kHz, 2 block STC, trained on Hungarian SpeechDat-E
    PHN_RU_SPDAT_LCRC_N1500 - 8kHz, 2 block STC, trained on Russian SpeechDat-E
    PHN_EN_TIMIT_LCRC_N500 - 16kHz, 2 block STC, trained on TIMIT, 15 banks
    """

    CZECH = "PHN_CZ_SPDAT_LCRC_N1500"
    HUNGARIAN = "PHN_HU_SPDAT_LCRC_N1500"
    RUSSIAN = "PHN_RU_SPDAT_LCRC_N1500"
    TIMIT = "PHN_EN_TIMIT_LCRC_N500"

    def __str__(self) -> str:
        """Return the system name, so members can be used in f-strings."""
        return str(self.value)


def read_HTK_file(file_path: str) -> torch.Tensor:
    """Read an HTK file and return its data as a tensor."""
    htk_reader = HTKFile()
    htk_reader.load(file_path)

    result = np.array(htk_reader.data)

    return torch.from_numpy(result)


def read_phone_label(file_path: str) -> List[str]:
    """Read the given file and return the phone label of every line.

    The label is the third whitespace-separated field of a line. Blank
    lines are skipped; a non-blank line with fewer than three fields
    raises ``IndexError``.
    """
    phones = []
    with open(file_path, "r", encoding="utf-8") as phone_file:
        for phone_line in phone_file:
            # split() rather than split(" "): it strips the trailing newline
            # (otherwise kept in the label of three-column lines) and
            # tolerates repeated spaces or tabs.
            fields = phone_line.split()
            if not fields:
                continue
            phones.append(fields[2])

    return phones
class HTKFile:
    """ Class to load binary HTK file.
    Details on the format can be found online in HTK Book chapter 5.7.1.
    Not everything is implemented 100%, but most features should be supported.
    Not implemented:
        CRC checking - files can have CRC, but it won't be checked for correctness
        VQ - Vector features are not implemented.
    """

    data = None
    nSamples = 0
    nFeatures = 0
    sampPeriod = 0
    basicKind = None
    qualifiers = None
    endian = ">"

    # Names of the basic parameter kinds, indexed by the low 6 bits of the
    # header's parameter-kind field.
    _BASIC_KINDS = (
        "WAVEFORM",
        "LPC",
        "LPREFC",
        "LPCEPSTRA",
        "LPDELCEP",
        "IREFC",
        "MFCC",
        "FBANK",
        "MELSPEC",
        "USER",
        "DISCRETE",
        "PLP",
    )

    # (bit mask, qualifier letter) pairs, in the order they are reported.
    _QUALIFIER_FLAGS = (
        (0o100, "E"),
        (0o200, "N"),
        (0o400, "D"),
        (0o1000, "A"),
        (0o2000, "C"),
        (0o4000, "Z"),
        (0o10000, "K"),
        (0o20000, "0"),
        (0o40000, "V"),
        (0o100000, "T"),
    )

    def load(self, filename):  # noqa: C901
        """ Loads HTK file.
        After loading the file you can check the following members:
            data (matrix) - data contained in the file
            nSamples (int) - number of frames in the file
            nFeatures (int) - number of features per frame
            sampPeriod (int) - sample period in 100ns units (e.g. fs=16 kHz -> 625)
            basicKind (string) - basic feature kind saved in the file
            qualifiers (list of strings) - feature options present in the file
        Raises NotImplementedError for files with the "V" (VQ) qualifier.
        """
        # Start from big-endian for every file, so an instance that
        # previously read a little-endian file can be reused.
        self.endian = ">"

        with open(filename, "rb") as f:
            header = f.read(12)
            self.nSamples, self.sampPeriod, sampSize, paramKind = struct.unpack(
                ">iihh", header
            )

            # Negative sizes mean the header was written little-endian.
            if self.nSamples < 0 or self.sampPeriod < 0 or sampSize < 0:
                self.endian = "<"
                (
                    self.nSamples,
                    self.sampPeriod,
                    sampSize,
                    paramKind,
                ) = struct.unpack(self.endian + "iihh", header)

            basicParameter = paramKind & 0x3F
            if basicParameter < len(self._BASIC_KINDS):
                self.basicKind = self._BASIC_KINDS[basicParameter]
            else:
                self.basicKind = "ERROR"

            self.qualifiers = [
                letter
                for mask, letter in self._QUALIFIER_FLAGS
                if (paramKind & mask) != 0
            ]

            is_short_kind = self.basicKind in ("IREFC", "WAVEFORM")
            is_compressed = "C" in self.qualifiers

            # Values are 2-byte shorts for these kinds, 4-byte floats otherwise.
            if is_compressed or "V" in self.qualifiers or is_short_kind:
                self.nFeatures = sampSize // 2
            else:
                self.nFeatures = sampSize // 4

            if is_compressed:
                # The two float vectors read below (A and B) take up the
                # space of four compressed frames.
                self.nSamples -= 4

            if "V" in self.qualifiers:
                raise NotImplementedError("VQ is not implemented")

            short_frame = f"{self.endian}{self.nFeatures}h"
            float_frame = f"{self.endian}{self.nFeatures}f"

            self.data = []
            if is_short_kind:
                for _ in range(self.nSamples):
                    s = f.read(sampSize)
                    values = struct.unpack_from(short_frame, s)
                    self.data.append([value / 32767.0 for value in values])
            elif is_compressed:
                A = list(
                    struct.unpack_from(float_frame, f.read(self.nFeatures * 4))
                )
                B = list(
                    struct.unpack_from(float_frame, f.read(self.nFeatures * 4))
                )

                for _ in range(self.nSamples):
                    s = f.read(sampSize)
                    values = struct.unpack_from(short_frame, s)
                    # Decompress: value = (stored + B) / A, per feature.
                    self.data.append(
                        [(value + b) / a for value, a, b in zip(values, A, B)]
                    )
            else:
                for _ in range(self.nSamples):
                    s = f.read(sampSize)
                    self.data.append(list(struct.unpack_from(float_frame, s)))

            if "K" in self.qualifiers:
                print("CRC checking not implememnted...")