diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py
new file mode 100644
index 0000000000..58a7a5141e
--- /dev/null
+++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py
@@ -0,0 +1,65 @@
+"""CTC finetuning on "hard_negatives" pretrainings that are given checkpoints of an "other_target" pretraining."""
+
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+    get_fairseq_root, \
+    run_fairseq_pretraining
+
+
+# pretraining
+other_target_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_v1",
+    commit="1397363c5c0e3c4e3ab620be562730399c852493",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    negative_sampling_strategy="other_target",
+)
+
+
+neg_hard_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_hard_v1",
+    commit="be51394d876428ad531e0786d80de43d6a8818af",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    negative_sampling_strategy="hard_negatives",
+)
+
+neg_hard_pretrain_jobs = dict()
+neg_hard_pretrain_jobs[0] = neg_hard_pretrain_job
+for start_cp in [50, 100, 150, 200, 250]:
+    neg_hard_pretrain_jobs[start_cp] = run_fairseq_pretraining(
+        exp_name=f"monophone_negatives_hard_after_{start_cp}ep_other_v1",
+        commit="be51394d876428ad531e0786d80de43d6a8818af",
+        python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+        checkpoint=other_target_pretrain_job.out_models[start_cp].model,
+        negative_sampling_strategy="hard_negatives",
+    )
+
+# fairseq root
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+for start_cp in [50, 100, 150, 200, 250]:
+    for additional_cp in range(50, 600+1-start_cp, 50):  # such that start_cp + additional_cp <= 600
+        model_conf_w2v = base_model_conf.copy()
+        model_conf_w2v["w2v_path"] = neg_hard_pretrain_jobs[start_cp].out_models[start_cp + additional_cp].model
+        eow_phon_ls100_ctc_base(
+            model_conf_w2v=model_conf_w2v,
+            train_name_suffix=os.path.join("w2v_negatives_hard", f"other_{start_cp}_hard_{additional_cp}"),
+            fairseq_root=fairseq_root,
+        )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py
new file mode 100644
index 0000000000..6709948d55
--- /dev/null
+++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py
@@ -0,0 +1,126 @@
+"""CTC finetuning on checkpoints of the "other_target" negative sampling pretraining with masking variations."""
+
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+    get_fairseq_root, \
+    run_fairseq_pretraining
+
+
+# pretraining
+neg_other_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_v1",
+    commit="1397363c5c0e3c4e3ab620be562730399c852493",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    negative_sampling_strategy="other_target",
+)
+
+# fairseq root
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning
+
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+checkpoints = [100, 200, 300, 400, 500, 600]
+for checkpoint in checkpoints:
+    # negative sampling
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[checkpoint].model
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join("w2v_neg_sampling_other_target", f"checkpoint_{checkpoint}"),
+        fairseq_root=fairseq_root,
+    )
+
+
+# finetuning experiments only for the last checkpoint
+final_cp = 600
+# random vs phoneme mask in finetuning
+model_conf_w2v = base_model_conf.copy()  # base model, no need to set `mask_strategy` and `mask_length`
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "random_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_length"] = 1
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# phoneme mask lengths in finetuning
+for mask_len in [1, 2]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_strategy"] = "phonemes"
+    model_conf_w2v["mask_length"] = mask_len
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target",
+            f"{mask_len}_phoneme_spec",
+            f"checkpoint_{final_cp}"
+        ),
+        fairseq_root=fairseq_root,
+    )
+
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_length"] = 1
+model_conf_w2v["mask_selection"] = "uniform"
+model_conf_w2v["mask_other"] = 1
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "1_2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# mask probability in finetuning
+for mask_prob in [0.35, 0.5, 0.65, 0.8]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+    # NOTE(review): no "mask_length" is set here, unlike the "phonemes" runs above - confirm this is intended
+    model_conf_w2v["mask_strategy"] = "phonemes"
+    model_conf_w2v["mask_prob"] = mask_prob
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target",
+            f"{str(mask_prob).replace('.', '_')}_phoneme_mask_prob",  # replace "." with "_" for the folder name
+            f"checkpoint_{final_cp}"
+        ),
+        fairseq_root=fairseq_root,
+    )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py
new file mode 100644
index 0000000000..b57c091de2
--- /dev/null
+++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py
@@ -0,0 +1,134 @@
+"""CTC finetuning on the "other_target" negatives + "phonemes" mask pretraining, with masking variations."""
+
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+    get_fairseq_root, \
+    run_fairseq_pretraining
+
+# Pretraining
+neg_other_trg_phon_boundary_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_boundary_masking_v1",
+    commit="87dec4ffcba2fd71e8838ca099a09816cddeff5b",
+    negative_sampling_strategy="other_target",
+    mask_strategy="phonemes",
+    mask_length=1,
+)
+
+# fairseq root
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+checkpoints = [100, 200, 300, 400, 500, 600]
+for checkpoint in checkpoints:
+    # negative sampling + phoneme boundary masking
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[checkpoint].model
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+            f"checkpoint_{checkpoint}"
+        ),
+        fairseq_root=fairseq_root,
+    )
+
+
+# finetuning experiments only for the last checkpoint
+final_cp = 600
+# random vs phoneme mask in finetuning
+model_conf_w2v = base_model_conf.copy()  # base model, no need to set `mask_strategy` and `mask_length`
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "random"
+model_conf_w2v["mask_length"] = 10
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "random_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# phoneme mask lengths in finetuning
+model_conf_w2v = base_model_conf.copy()  # base model, no need to set `mask_length`
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "1_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_length"] = 2
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_length"] = 1
+model_conf_w2v["mask_other"] = 1
+model_conf_w2v["mask_selection"] = "uniform"
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "1_2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# mask probability in finetuning
+for mask_prob in [0.35, 0.5, 0.65, 0.8]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_prob"] = mask_prob
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+            f"{str(mask_prob).replace('.', '_')}_phoneme_mask_prob",  # replace '.' with '_'
+            f"checkpoint_{final_cp}"
+        ),
+        fairseq_root=fairseq_root,
+    )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py
new file mode 100644
index 0000000000..776a88637e
--- /dev/null
+++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py
@@ -0,0 +1,132 @@
+"""CTC finetuning on checkpoints of the "phonemes" mask pretraining, with masking variations."""
+
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+    get_fairseq_root, \
+    run_fairseq_pretraining
+
+# Pretraining
+phon_boundary_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_boundary_masking_v1",
+    commit="87dec4ffcba2fd71e8838ca099a09816cddeff5b",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    mask_strategy="phonemes",
+    mask_length=1,
+)
+
+# fairseq root
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+checkpoints = [100, 200, 300, 400, 500, 600]
+for checkpoint in checkpoints:
+    # phoneme boundary masking
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[checkpoint].model
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join("w2v_phoneme_boundary_masking", f"checkpoint_{checkpoint}"),
+        fairseq_root=fairseq_root,
+    )
+
+# finetuning experiments only for the last checkpoint
+final_cp = 600
+# random vs phoneme mask in finetuning
+model_conf_w2v = base_model_conf.copy()  # base model, no need to set `mask_strategy` and `mask_length`
+model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_phoneme_boundary_masking",
+        "phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "random"
+model_conf_w2v["mask_length"] = 10
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_phoneme_boundary_masking",
+        "random_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# phoneme mask lengths in finetuning
+model_conf_w2v = base_model_conf.copy()  # base model, no need to set `mask_strategy` and `mask_length`
+model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_phoneme_boundary_masking",
+        "1_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_length"] = 2
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_phoneme_boundary_masking",
+        "2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+# NOTE(review): no "mask_length" is set for this run, unlike the "2_phoneme_spec" run above - confirm this is intended
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_other"] = 1
+model_conf_w2v["mask_selection"] = "uniform"
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_phoneme_boundary_masking",
+        "1_2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+    ),
+    fairseq_root=fairseq_root,
+)
+
+# mask probability in finetuning
+for mask_prob in [0.35, 0.5, 0.65, 0.8]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_prob"] = mask_prob
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_phoneme_boundary_masking",
+            f"{str(mask_prob).replace('.', '_')}_phoneme_mask_prob",  # replace '.' with '_'
+            f"checkpoint_{final_cp}"
+        ),
+        fairseq_root=fairseq_root,
+    )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune.py
deleted file mode 100644
index 4680efc1b3..0000000000
--- a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from sisyphus import tk
-import os
-
-from i6_experiments.users.vieting.experiments.librispeech.\
-    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
-from i6_experiments.users.vieting.experiments.librispeech.\
-    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
-    get_fairseq_root, \
-    run_fairseq_pretraining_negatives_other_target, \
-    run_fairseq_pretraining_phoneme_boundary_masking, \
-    run_fairseq_pretraining_phoneme_negatives_other_target_boundary_masking
-
-# Pretraining
-neg_other_trg_pretrain_job = run_fairseq_pretraining_negatives_other_target()
-phon_boundary_pretrain_job = run_fairseq_pretraining_phoneme_boundary_masking()
-neg_other_trg_phon_boundary_pretrain_job = run_fairseq_pretraining_phoneme_negatives_other_target_boundary_masking()
-
-# fairseq root
-fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
-
-# Finetuning
-base_model_conf = {
-    "_name": "wav2vec_ctc",
-    "apply_mask": True,
-    "mask_prob": 0.65,
-    "mask_channel_prob": 0.5,
-    "mask_channel_length": 64,
-    "layerdrop": 0.1,
-    "activation_dropout": 0.1,
-    "feature_grad_mult": 0.0,
-    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
-}
-checkpoints = [100, 200, 300, 400, 500, 600]
-for checkpoint in checkpoints:
-    # negative sampling
-    model_conf_w2v = base_model_conf.copy()
-    model_conf_w2v["w2v_path"] = neg_other_trg_pretrain_job.out_models[checkpoint].model
-    eow_phon_ls100_ctc_base(
-        model_conf_w2v=model_conf_w2v,
-        train_name_suffix=os.path.join("w2v_neg_sampling_other_target", f"checkpoint_{checkpoint}"),
-        fairseq_root=fairseq_root,
-    )
-
-    # phoneme boundary masking
-    model_conf_w2v = base_model_conf.copy()
-    model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[checkpoint].model
-    eow_phon_ls100_ctc_base(
-        model_conf_w2v=model_conf_w2v,
-        train_name_suffix=os.path.join("w2v_phoneme_boundary_masking", f"checkpoint_{checkpoint}"),
-        fairseq_root=fairseq_root,
-    )
-
-    # negative sampling + phoneme boundary masking
-    model_conf_w2v = base_model_conf.copy()
-    model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[checkpoint].model
-    eow_phon_ls100_ctc_base(
-        model_conf_w2v=model_conf_w2v,
-        train_name_suffix=os.path.join(
-            "w2v_neg_sampling_other_target_phoneme_boundary_masking",
-            f"checkpoint_{checkpoint}"
-        ),
-        fairseq_root=fairseq_root,
-    )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py
new file mode 100644
index 0000000000..c743c1d3b5
--- /dev/null
+++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py
@@ -0,0 +1,59 @@
+"""CTC finetuning on checkpoints of pretrainings with 5, 10 and 15 positives (`num_positives`)."""
+
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+    get_fairseq_root, \
+    run_fairseq_pretraining
+
+# pretraining
+# positive sampling
+pos_sampling_n_pretrain_job = {
+    5: run_fairseq_pretraining(
+        exp_name="monophone_positive_sampling_5_v2",
+        commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd",
+        num_positives=5,
+    ),
+    10: run_fairseq_pretraining(
+        exp_name="monophone_positive_sampling_10_v2",
+        commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd",
+        num_positives=10,
+    ),
+    15: run_fairseq_pretraining(
+        exp_name="monophone_positive_sampling_15_v2",
+        commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd",
+        num_positives=15,
+    ),
+}
+
+
+# fairseq root
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+# finetuning
+for checkpoint in [100, 200, 300, 400, 500, 600]:
+    for n, pos_pretrain_job in pos_sampling_n_pretrain_job.items():
+        model_conf_w2v = base_model_conf.copy()
+        model_conf_w2v["w2v_path"] = pos_pretrain_job.out_models[checkpoint].model
+        eow_phon_ls100_ctc_base(
+            model_conf_w2v=model_conf_w2v,
+            train_name_suffix=os.path.join("w2v_positive_sampling", f"pos_samples_{n}", f"checkpoint_{checkpoint}"),
+            fairseq_root=fairseq_root,
+        )
diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla_pretrain_finetune.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla.py
similarity index 100%
rename from users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla_pretrain_finetune.py
rename to users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla.py