Sinica-SLAM · txya900619 · Aug 5, 2021 · Aug 6, 2021 · Aug 6, 2021 · Aug 11, 2021
diff --git a/recipes/Fisher-Callhome-Spanish/ST/espnet-transformer/hparams/conformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/espnet-transformer/hparams/conformer.yaml
@@ -0,0 +1,203 @@
+# ############################################################################
+# Model: E2E ST with Conformer from ESPnet
+# Encoder: Conformer Encoder
+# Decoder: Transformer Decoder beamsearch
+# Tokens: BPE
+# losses: CTC + KLdiv (Label Smoothing loss)
+# Training: Fisher-Callhome
+# Authors: YAO-FEI, CHENG
+# ############################################################################
+# The seed must be set at the top of the yaml, before objects with parameters are created
+# The original recipe is from ESPnet:
+# https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/conf/tuning/train_pytorch_conformer.yaml
+
+debug: False
+seed: 8886 # also the last path component of <output_folder>
+num_workers: 8 # shared by the train/valid/test dataloader options
+__set_seed: !apply:torch.manual_seed [!ref <seed>] # seed torch RNG
+output_folder: !ref results/conformer_espnet_mid_sp/<seed>
+ckpt_interval_minutes: 15 # save checkpoint every N min
+bleu_file: !ref <output_folder>/bleu.txt
+save_folder: !ref <output_folder>/save # checkpoints_dir of the checkpointer
+train_log: !ref <output_folder>/train_log.txt # save_file of the train logger
+
+# Data files (both entries are placeholders and must be given at load time)
+
+data_folder: !PLACEHOLDER # Folder of the files generated by the preparation script
+tokenizer_file: !PLACEHOLDER # .model file that the pretrainer loads into <tokenizer>
+
+# Tokenizer initialization (no model file is given here; see `pretrainer`)
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+# Pretrainer: pairs the `tokenizer` object with the file it is loaded from
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    paths:
+        tokenizer: !ref <tokenizer_file>
+    loadables:
+        tokenizer: !ref <tokenizer>
+    collect_in: ./tokenizer
+
+# The train logger writes training statistics to a file, as well as stdout.
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log> # resolves to <output_folder>/train_log.txt
+
+# Features (values referenced by compute_features, speed_perturb and the model)
+sample_rate: 16000 # also orig_freq of speed_perturb
+n_fft: 400
+n_mels: 80 # also idim of the Transformer model
+
+compute_features: !new:speechbrain.lobes.features.Fbank # Fbank extractor
+    n_mels: !ref <n_mels>
+    n_fft: !ref <n_fft>
+    sample_rate: !ref <sample_rate>
+
+normalize: !new:speechbrain.processing.features.InputNormalization
+    norm_type: global
+    update_until_epoch: 4 # presumably statistics are frozen afterwards - TODO confirm
+
+speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb
+    orig_freq: !ref <sample_rate>
+    speeds: [90, 100, 110] # presumably percent of original speed - TODO confirm
+
+# Trainer settings
+number_of_epochs: 30 # limit of <epoch_counter>
+valid_search_eopch: 100 # NOTE(review): misspelt "epoch"; read by valid_search_interval
+batch_size: 8 # this works for 2 GPUs with 11GB
+gradient_accumulation: 16
+gradient_clipping: 5.0
+loss_reduction: batchmean
+sorting: random
+
+# Stage-related parameters
+stage_one_epochs: 100 # exceeds number_of_epochs: the optimizer is not changed in this recipe
+lr_adam: 2.5 # lr_initial of the Noam scheduler
+lr_sgd: 0.000025 # lr of the SGD optimizer
+
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs> # upper bound on the epoch count
+
+# Dataloader options (same batch size and worker count for every split)
+train_dataloader_opts:
+    num_workers: !ref <num_workers>
+    batch_size: !ref <batch_size>
+
+valid_dataloader_opts:
+    num_workers: !ref <num_workers>
+    batch_size: !ref <batch_size>
+
+test_dataloader_opts:
+    num_workers: !ref <num_workers>
+    batch_size: !ref <batch_size>
+
+####################### Model parameters ###########################
+# Transformer
+d_model: 256 # passed as adim and as the Noam scheduler model_size
+nhead: 4 # passed as aheads
+num_encoder_layers: 12 # passed as elayers
+num_decoder_layers: 6 # passed as dlayers
+d_ffn: 2048 # passed as both eunits and dunits
+transformer_dropout: 0.1 # passed as dropout_rate
+activation: !name:torch.nn.GELU # not referenced elsewhere in this file
+output_neurons: 1000 # not referenced elsewhere in this file
+vocab_size: 1000 # passed as odim and as the size of seq_cost
+attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA"
+kernel_size: 15 # passed as cnn_module_kernel
+encoder_module: conformer # not referenced elsewhere in this file
+
+# Multi-task
+# using ctc needs a ctc_lin entry in `modules`; none exists in this file yet
+ctc_weight: 0
+asr_weight: 0
+mt_weight: 0
+
+# Outputs
+blank_index: 0 # same value as pad_index and unk_index
+label_smoothing: 0.1 # smoothing of seq_cost
+pad_index: 0 # ignore_id of the model and padding_idx of seq_cost
+bos_index: 1 # sos of the model
+eos_index: 2 # eos of the model
+unk_index: 0
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+valid_search_interval: !ref <valid_search_eopch> # currently above number_of_epochs
+valid_beam_size: 10
+test_beam_size: 10
+
+############################## models ################################
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+    num_blocks: 2 # the tuples below each have two entries
+    out_channels: (256, 256)
+    kernel_sizes: (3, 3)
+    strides: (2, 2)
+    residuals: (False, False)
+    num_layers_per_block: 1
+    input_shape: (8, 10, 80)
+
+Transformer: !new:speechbrain.lobes.models.transformer.ESPNetConformer.E2E # yamllint disable-line rule:line-length
+    idim: !ref <n_mels> # sizes shared across the model
+    odim: !ref <vocab_size>
+    adim: !ref <d_model>
+    aheads: !ref <nhead>
+    dropout_rate: !ref <transformer_dropout>
+    sos: !ref <bos_index> # special token ids
+    eos: !ref <eos_index>
+    ignore_id: !ref <pad_index>
+    transformer_input_layer: 'conv2d' # encoder-side settings
+    elayers: !ref <num_encoder_layers>
+    eunits: !ref <d_ffn>
+    transformer_encoder_selfattn_layer_type: 'rel_selfattn'
+    transformer_encoder_pos_enc_layer_type: 'rel_pos'
+    transformer_encoder_activation_type: 'swish'
+    macaron_style: True
+    use_cnn_module: True
+    cnn_module_kernel: !ref <kernel_size>
+    dlayers: !ref <num_decoder_layers> # decoder-side settings
+    dunits: !ref <d_ffn>
+    transformer_decoder_selfattn_layer_type: 'selfattn'
+    wshare: 4 # wshare / ldconv_* settings
+    ldconv_usebias: False
+    ldconv_encoder_kernel_length: '21_23_25_27_29_31_33_35_37_39_41_43'
+    ldconv_decoder_kernel_length: '11_13_15_17_19_21'
+
+modules: # NOTE(review): CNN is part of <model> but is not listed here - confirm
+    Transformer: !ref <Transformer>
+
+model: !new:torch.nn.ModuleList # stored under `model` by the checkpointer
+    - [!ref <CNN>, !ref <Transformer>]
+
+# Two optimizers are declared here to support two-stage training
+Adam: !name:torch.optim.Adam
+    betas: (0.9, 0.98)
+    eps: 0.000000001
+    lr: 0 # presumably overwritten by the Noam scheduler - TODO confirm
+
+SGD: !name:torch.optim.SGD
+    nesterov: True
+    momentum: 0.99
+    lr: !ref <lr_sgd>
+
+seq_cost: !new:espnet.nets.pytorch_backend.transformer.label_smoothing_loss.LabelSmoothingLoss
+    smoothing: !ref <label_smoothing>
+    normalize_length: False
+    padding_idx: !ref <pad_index>
+    size: !ref <vocab_size>
+
+noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
+    model_size: !ref <d_model>
+    n_warmup_steps: 35000
+    lr_initial: !ref <lr_adam>
+
+# Checkpoint setting: the recoverables below are stored under <save_folder>
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        model: !ref <model> # ModuleList holding CNN and Transformer
+        noam_scheduler: !ref <noam_annealing>
+        normalizer: !ref <normalize>
+        counter: !ref <epoch_counter>
+
+bleu_computer: !name:speechbrain.utils.bleu.BLEUStats # BLEU statistics factory
+    merge_words: False
+acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats # no arguments bound