Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
6d0cc5c
Add but phone recognizer support
Aug 5, 2021
68a3c1e
Merge pull request #4 from Sinica-SLAM/feature/BUT-phone-recognizer
Aug 6, 2021
9772d75
Merge branch 'speechbrain:develop' into develop
Aug 6, 2021
5637a75
Merge branch 'speechbrain:develop' into develop
Aug 11, 2021
cacaac6
Merge branch 'speechbrain:develop' into develop
Aug 16, 2021
03e6d33
Merge branch 'speechbrain:develop' into develop
Aug 18, 2021
b660f88
Merge branch 'speechbrain:develop' into develop
Aug 20, 2021
9d7c66c
Add matbn_prepare and complete prepare function
txya900619 Jul 27, 2021
4013d73
Modify matbm_prepare.py, change duraion from save end to save end - s…
txya900619 Jul 27, 2021
11dd9b7
Fix matbn_prepare.py bug, add custom JSON encoder to serialize dataclass
txya900619 Jul 30, 2021
b7fd7d1
Add tokenizer
txya900619 Jul 31, 2021
35b5720
Fix remove_useless_transcripts keep_unk useless bug
txya900619 Jul 31, 2021
44ed331
Add bos_id and eos_id to tokenizer hparams
txya900619 Aug 2, 2021
cecdbc7
Add language model
txya900619 Aug 5, 2021
0c7c790
Change eval to test, test and dev to valid, and let model smaller
txya900619 Aug 13, 2021
4d138af
Change prepare data structure to fit asr
txya900619 Aug 13, 2021
421a368
Add transformer asr
txya900619 Aug 13, 2021
b1f697d
Merge branch 'speechbrain:develop' into develop
Aug 24, 2021
b41027d
Fix prepare wav path problem, add transformer LM
txya900619 Aug 24, 2021
0cb661b
Merge branch 'speechbrain:develop' into develop
Aug 29, 2021
396e286
Add speech augmentation to recipe and delete eval
txya900619 Sep 3, 2021
74dcf6f
Merge branch 'speechbrain:develop' into develop
Sep 7, 2021
773a266
Update hparams and delete eval in LM
txya900619 Sep 8, 2021
e65cfa2
Add hparams file for conformer
txya900619 Sep 8, 2021
433c203
Replce some hparams to modulesm, let model can parallel
txya900619 Sep 14, 2021
8a2f290
Add normalize to modules
txya900619 Sep 14, 2021
e3e5133
Update yaml add normalize to modules
txya900619 Sep 14, 2021
5951643
Merge branch 'speechbrain:develop' into develop
freddy5566 Sep 22, 2021
bb1acd8
Add ESPnet Transformer and Conformer
Oct 1, 2021
b676775
Merge branch 'speechbrain:develop' into develop
freddy5566 Oct 1, 2021
7a3fc21
Comment out test_filterbank
Oct 1, 2021
4092902
Merge pull request #7 from Sinica-SLAM/feature/espnet-st
Oct 1, 2021
01e5cc7
Merge branch 'speechbrain:develop' into develop
freddy5566 Oct 19, 2021
90e4474
Ignore ESPnet related tests
Oct 19, 2021
86464e4
Merge pull request #8 from Sinica-SLAM/fix/ignore-espnet-test
Oct 19, 2021
bc4a423
Fix conflict
Nov 30, 2021
7922051
Merge branch 'speechbrain-develop' into develop
Nov 30, 2021
8db3fad
Update hyperparams
txya900619 Nov 30, 2021
ddcc615
Add README.md
txya900619 Nov 30, 2021
374d72a
Merge pull request #2 from Sinica-SLAM/feature/MATBN
Dec 1, 2021
afe105d
Merge branch 'speechbrain:develop' into develop
freddy5566 Dec 1, 2021
c3153ed
Merge branch 'speechbrain:develop' into develop
freddy5566 Dec 18, 2021
f6cbe2a
Merge branch 'speechbrain:develop' into develop
freddy5566 Dec 28, 2021
69ac6cb
Merge branch 'speechbrain:develop' into develop
freddy5566 Jan 17, 2022
3721840
Merge branch 'speechbrain:develop' into develop
freddy5566 Jan 19, 2022
3a5778e
Merge branch 'speechbrain:develop' into develop
freddy5566 Feb 15, 2022
7a14d45
Add time-rnnlm-baseline script and yaml
txya900619 Feb 19, 2022
3548818
Update RNNLM_cna.yaml
txya900619 Feb 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# ############################################################################
# Model: E2E ST with Conformer from ESPnet
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder beamsearch
# Tokens: BPE
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: Fisher-Callhome
# Authors: YAO-FEI, CHENG
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
# The original recipe is from ESPnet:
# https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/conf/tuning/train_pytorch_conformer.yaml

# General experiment settings. The seed is applied to torch through the
# `__set_seed` entry and is also part of the output folder path.
debug: False
seed: 8886
num_workers: 8  # referenced by the train/valid/test dataloader options
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/conformer_espnet_mid_sp/<seed>
ckpt_interval_minutes: 15 # save checkpoint every N min
# All result files below live under <output_folder>.
bleu_file: !ref <output_folder>/bleu.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
# Both entries are !PLACEHOLDER values and are expected to be overridden by
# the caller.

data_folder: !PLACEHOLDER # Folder of the files generated by the preparation script
tokenizer_file: !PLACEHOLDER # .model file corresponding to the Tokenizer model

# Tokenizer initialization. The `pretrainer` entry of this file lists this
# object under `loadables`, with <tokenizer_file> as its path.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Load the pretrained tokenizer: `paths` maps each loadable name to its
# source file and `loadables` maps the same name to the object that receives
# it. `collect_in` is the local folder handed to the Pretrainer.
# The arguments must be nested under the constructor; at top level the two
# `tokenizer` keys would clash with the `tokenizer` object itself.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: ./tokenizer
    loadables:
        tokenizer: !ref <tokenizer>
    paths:
        tokenizer: !ref <tokenizer_file>

# The train logger writes training statistics to a file, as well as stdout.
# `save_file` is a constructor argument and therefore nested under the object.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Features
# Scalar settings shared by the feature extractor and the speed perturbation.
sample_rate: 16000
n_fft: 400
n_mels: 80  # also used as the input dimension (`idim`) of the Transformer

# Filterbank front-end. Its arguments are nested so that they are passed to
# Fbank instead of redefining the top-level scalars above.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

# Input feature normalization with global statistics. This object is also
# saved by the checkpointer under the name `normalizer`.
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4

# Speed perturbation augmentation applied at the feature sample rate.
# NOTE(review): `speeds` are presumably percentages of the original speed
# (90%, 100%, 110%) - confirm against the SpeedPerturb documentation.
speed_perturb: !new:speechbrain.processing.speech_augmentation.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [90, 100, 110]

# Trainer settings
number_of_epochs: 30
# NOTE(review): the key name is misspelled ("eopch"). It is kept as is because
# `valid_search_interval` references it by this exact name. Its value (100) is
# larger than number_of_epochs (30); presumably the validation beam search is
# therefore never triggered - confirm against the training script.
valid_search_eopch: 100
batch_size: 8 # this works for 2 GPUs with 11GB
gradient_accumulation: 16
gradient_clipping: 5.0
loss_reduction: batchmean
sorting: random

# Stage-related parameters
stage_one_epochs: 100 # the optimizer is not switched in this recipe
lr_adam: 2.5  # used as `lr_initial` of noam_annealing
lr_sgd: 0.000025  # used as `lr` of the SGD optimizer

# Epoch counter bounded by <number_of_epochs>; it is also saved by the
# checkpointer under the name `counter`.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Dataloader options
# Each split gets its own mapping. The entries must be nested: at top level
# they would repeatedly redefine the global `batch_size` and `num_workers`
# keys (as self-references) and leave the three *_dataloader_opts keys empty.
train_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

valid_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

test_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

####################### Model parameters ###########################
# Transformer
# d_model, nhead, the layer counts, d_ffn, transformer_dropout, vocab_size and
# kernel_size are referenced by the `Transformer` entry of this file.
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
# NOTE(review): `activation`, `output_neurons`, `attention_type` and
# `encoder_module` are not referenced anywhere else in this file; presumably
# they are read by the training script or are leftovers - confirm.
activation: !name:torch.nn.GELU
output_neurons: 1000
vocab_size: 1000
attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA"
kernel_size: 15
encoder_module: conformer

# Multi-task
# All auxiliary task weights are 0 in this recipe.
# NOTE(review): the original note asked to uncomment `ctc_lin` in the modules
# section ("line:190") when using ctc, but this file defines no `ctc_lin`
# entry; one would have to be added to `modules` first - confirm.
ctc_weight: 0
asr_weight: 0
mt_weight: 0

# Outputs
# pad_index, bos_index and eos_index are passed to the Transformer as
# `ignore_id`, `sos` and `eos`; pad_index and label_smoothing are also used
# by seq_cost.
# NOTE(review): blank, pad and unk all share index 0 - confirm that this
# matches the ids of the tokenizer model.
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# Takes its value from the (misspelled) `valid_search_eopch` key defined in
# the trainer settings.
valid_search_interval: !ref <valid_search_eopch>
valid_beam_size: 10
test_beam_size: 10

############################## models ################################
# Convolutional front-end: two blocks with one layer each, 256 output
# channels, 3x3 kernels and stride 2 per block.
# NOTE(review): CNN is part of `model` but is not listed in `modules`, and the
# Transformer is configured with its own "conv2d" input layer - confirm
# whether this front-end is actually used by the training script.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (256, 256)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

# ESPnet Conformer E2E model. Feature and vocabulary sizes, model width, head
# and layer counts, dropout, special token ids and the convolution kernel
# size come from the scalar parameters defined earlier in this file.
# All options below are constructor arguments and must be nested under it.
Transformer: !new:speechbrain.lobes.models.transformer.ESPNetConformer.E2E # yamllint disable-line rule:line-length
    idim: !ref <n_mels>
    odim: !ref <vocab_size>
    adim: !ref <d_model>
    aheads: !ref <nhead>
    wshare: 4
    ldconv_encoder_kernel_length: "21_23_25_27_29_31_33_35_37_39_41_43"
    ldconv_usebias: False
    eunits: !ref <d_ffn>
    elayers: !ref <num_encoder_layers>
    transformer_input_layer: "conv2d"
    transformer_encoder_selfattn_layer_type: "rel_selfattn"
    transformer_decoder_selfattn_layer_type: "selfattn"
    ldconv_decoder_kernel_length: "11_13_15_17_19_21"
    dunits: !ref <d_ffn>
    dlayers: !ref <num_decoder_layers>
    dropout_rate: !ref <transformer_dropout>
    sos: !ref <bos_index>
    eos: !ref <eos_index>
    ignore_id: !ref <pad_index>
    transformer_encoder_pos_enc_layer_type: "rel_pos"
    transformer_encoder_activation_type: "swish"
    macaron_style: True
    use_cnn_module: True
    cnn_module_kernel: !ref <kernel_size>

# Modules mapping. The entry is nested so that it does not redefine the
# top-level `Transformer` object as a reference to itself.
modules:
    Transformer: !ref <Transformer>

# All trainable parts gathered in one list; this object is what the
# checkpointer saves under the name `model`.
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>]

# Define two optimizers here for two-stage training. Both are `!name:`
# entries with their keyword arguments nested below them.
# NOTE(review): Adam starts with lr 0; presumably the effective learning rate
# is set by noam_annealing during training - confirm in the training script.
Adam: !name:torch.optim.Adam
    lr: 0
    betas: (0.9, 0.98)
    eps: 0.000000001

SGD: !name:torch.optim.SGD
    lr: !ref <lr_sgd>
    momentum: 0.99
    nesterov: True

# Label smoothing loss from ESPnet over the output vocabulary; positions
# equal to <pad_index> are passed as the padding index.
seq_cost: !new:espnet.nets.pytorch_backend.transformer.label_smoothing_loss.LabelSmoothingLoss
    size: !ref <vocab_size>
    padding_idx: !ref <pad_index>
    smoothing: !ref <label_smoothing>
    normalize_length: False

# Noam learning-rate schedule driven by <lr_adam>, with 35000 warm-up steps
# and scaled by the model width. Also saved by the checkpointer as
# `noam_scheduler`.
noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
    lr_initial: !ref <lr_adam>
    n_warmup_steps: 35000
    model_size: !ref <d_model>

# Checkpoint setting
# Checkpoints are written to <save_folder>. `recoverables` maps a checkpoint
# name to the object to save and restore; it must be nested so that its
# `model` entry does not clash with the top-level `model` object.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        noam_scheduler: !ref <noam_annealing>
        normalizer: !ref <normalize>
        counter: !ref <epoch_counter>

# Metric factories (`!name:` entries, instantiated by the caller).
# `merge_words` is a keyword argument of the BLEU computer.
bleu_computer: !name:speechbrain.utils.bleu.BLEUStats
    merge_words: False
acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
Loading