diff --git a/README.md b/README.md
index e5c6b7f..ba99286 100644
--- a/README.md
+++ b/README.md
@@ -160,6 +160,23 @@ The loss curves, synthesized mel-spectrograms, and audios are shown.
 ![](./img/tensorboard_spec_vctk.png)
 ![](./img/tensorboard_audio_vctk.png)
 
+## Ablation Study
+![](./img/tensorboard_loss_ljs_comparison.png)
+
+| ID | Model | Block Type | Pitch Conditioning |
+| --- | --- | --- | --- |
+| 1 | LJSpeech_transformer_fs2_cwt | `transformer_fs2` | continuous wavelet transform |
+| 2 | LJSpeech_transformer_cwt | `transformer` | continuous wavelet transform |
+| 3 | LJSpeech_transformer_frame | `transformer` | frame-level f0 |
+| 4 | LJSpeech_transformer_ph | `transformer` | phoneme-level f0 |
+
+Observations from the comparison:
+1. Changing the building block (ID 1 vs. 2):
+    "transformer_fs2" appears better optimized in memory usage and model size, so training time and mel losses decrease. However, the output quality does not improve dramatically, and the "transformer" block sometimes generates speech with an even more stable pitch contour than "transformer_fs2".
+2. Changing the pitch conditioning (ID 2-4): there is a trade-off between audio quality (pitch stability) and expressiveness.
+    - audio quality: "ph" >= "frame" > "cwt"
+    - expressiveness: "cwt" > "frame" > "ph"
+
 # Notes
 
 - Both phoneme-level and frame-level variances are supported in both supervised and unsupervised duration modeling.
@@ -175,6 +192,10 @@ The loss curves, synthesized mel-spectrograms, and audios are shown.
 - For vocoder, **HiFi-GAN** and **MelGAN** are supported.
 
 ### Updates Log
+- Mar.05, 2022 (v0.2.1): Fix and update the codebase & pre-trained models with demo samples
+    1. Fix the variance adaptor so that it works with all combinations of building block and variance type/level
+    2. Update pre-trained models and demo samples of LJSpeech and VCTK trained with the "transformer_fs2" building block and "cwt" pitch conditioning
+    3. Share ablation study results comparing "transformer" and "transformer_fs2" under three types of pitch conditioning ("frame", "ph", and "cwt")
 - Feb.18, 2022 (v0.2.0): Update data preprocessor and variance adaptor & losses following [keonlee9420's DiffSinger](https://github.com/keonlee9420/DiffSinger) / Add various prosody modeling methods
    1. Prepare two different types of data pipeline in preprocessor to maximize unsupervised/supervised duration modelings
    2. Adopt wavelet for pitch modeling & loss
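The "cwt" rows in the ablation table condition pitch on a continuous wavelet transform of the log-f0 contour (the "Adopt wavelet for pitch modeling & loss" entry above). As a rough, self-contained illustration of that idea only — not the repository's actual implementation, which follows DiffSinger's pitch tools — the sketch below decomposes a toy log-f0 contour into multi-scale coefficients with a hand-rolled Ricker (Mexican-hat) wavelet; the function names and scale choices here are illustrative assumptions.

```python
# Illustrative only: a crude continuous wavelet transform of a log-f0 contour.
# The repository's real pitch tools (adapted from DiffSinger) use a different
# wavelet implementation, scale set, and normalization.
import numpy as np

def ricker(length, width):
    """Mexican-hat (Ricker) wavelet sampled at `length` points."""
    t = np.arange(length) - (length - 1) / 2.0
    a = 2.0 / (np.sqrt(3.0 * width) * np.pi ** 0.25)
    return a * (1.0 - (t / width) ** 2) * np.exp(-0.5 * (t / width) ** 2)

def cwt_spec(log_f0, widths=(1, 2, 4, 8)):  # real configs use more scales
    """One coefficient track per scale, shaped (frames, scales)."""
    return np.stack(
        [np.convolve(log_f0, ricker(min(10 * w + 1, len(log_f0)), w), mode="same")
         for w in widths],
        axis=-1,
    )

# Toy frame-level contour: voiced frames carry f0 in Hz, unvoiced frames are 0.
f0 = np.array([0, 0, 210, 215, 220, 230, 225, 0, 0, 180, 175, 170, 0], dtype=float)
log_f0 = np.zeros_like(f0)
log_f0[f0 > 0] = np.log(f0[f0 > 0])
print(cwt_spec(log_f0).shape)  # (13, 4): the pitch predictor models such coefficients
```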
diff --git a/dataset.py b/dataset.py
index c1631cc..d971722 100644
--- a/dataset.py
+++ b/dataset.py
@@ -125,12 +125,6 @@ def __getitem__(self, idx):
             )
             f0cwt_mean_std = np.load(f0cwt_mean_std_path)
             f0_mean, f0_std = float(f0cwt_mean_std[0]), float(f0cwt_mean_std[1])
-        elif self.pitch_type == "ph":
-            f0_phlevel_sum = torch.zeros(phone.shape).float().scatter_add(
-                0, torch.from_numpy(mel2ph).long() - 1, torch.from_numpy(f0).float())
-            f0_phlevel_num = torch.zeros(phone.shape).float().scatter_add(
-                0, torch.from_numpy(mel2ph).long() - 1, torch.ones(f0.shape)).clamp_min(1)
-            f0_ph = (f0_phlevel_sum / f0_phlevel_num).numpy()
 
         sample = {
             "id": basename,
@@ -140,7 +134,6 @@ def __getitem__(self, idx):
             "mel": mel,
             "pitch": pitch,
             "f0": f0,
-            "f0_ph": f0_ph,
             "uv": uv,
             "cwt_spec": cwt_spec,
             "f0_mean": f0_mean,
@@ -187,8 +180,6 @@ def reprocess(self, data, idxs):
            cwt_specs = pad_2D(cwt_specs)
            f0_means = np.array(f0_means)
            f0_stds = np.array(f0_stds)
-        elif self.pitch_type == "ph":
-            f0s = [data[idx]["f0_ph"] for idx in idxs]
         energies = [data[idx]["energy"] for idx in idxs]
         durations = [data[idx]["duration"] for idx in idxs] if not self.learn_alignment else None
         mel2phs = [data[idx]["mel2ph"] for idx in idxs] if not self.learn_alignment else None
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0092.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0092.png
new file mode 100644
index 0000000..fc9bc9f
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0092.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0092.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0092.wav
new file mode 100644
index 0000000..4f12612
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0092.wav differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0133.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0133.png
new file mode 100644
index 0000000..c5fa7f0
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0133.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0133.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0133.wav
new file mode 100644
index 0000000..d828ec5
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0133.wav differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0142.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0142.png
new file mode 100644
index 0000000..98a22be
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0142.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0142.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0142.wav
new file mode 100644
index 0000000..8d4cea8
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0142.wav differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0147.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0147.png
new file mode 100644
index 0000000..bb72e77
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0147.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0147.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0147.wav
new file mode 100644
index 0000000..8fc441f
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0147.wav differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0151.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0151.png
new file mode 100644
index 0000000..3ee68df
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0151.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0151.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0151.wav
new file mode 100644
index 0000000..3fa3d3a
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0151.wav differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0159.png b/demo/LJSpeech_v0.2.1/900000/LJ001-0159.png
new file mode 100644
index 0000000..b49b70c
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0159.png differ
diff --git a/demo/LJSpeech_v0.2.1/900000/LJ001-0159.wav b/demo/LJSpeech_v0.2.1/900000/LJ001-0159.wav
new file mode 100644
index 0000000..f10e881
Binary files /dev/null and b/demo/LJSpeech_v0.2.1/900000/LJ001-0159.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p225-021.png b/demo/VCTK_v0.2.1/900000/p225-021.png
new file mode 100644
index 0000000..ba68e98
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p225-021.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p225-021.wav b/demo/VCTK_v0.2.1/900000/p225-021.wav
new file mode 100644
index 0000000..b5538b2
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p225-021.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p226-351.png b/demo/VCTK_v0.2.1/900000/p226-351.png
new file mode 100644
index 0000000..506170f
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p226-351.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p226-351.wav b/demo/VCTK_v0.2.1/900000/p226-351.wav
new file mode 100644
index 0000000..4f3f4d8
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p226-351.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p232-197.png b/demo/VCTK_v0.2.1/900000/p232-197.png
new file mode 100644
index 0000000..d255de1
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p232-197.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p232-197.wav b/demo/VCTK_v0.2.1/900000/p232-197.wav
new file mode 100644
index 0000000..9c08d49
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p232-197.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p236-148.png b/demo/VCTK_v0.2.1/900000/p236-148.png
new file mode 100644
index 0000000..d78b4c7
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p236-148.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p236-148.wav b/demo/VCTK_v0.2.1/900000/p236-148.wav
new file mode 100644
index 0000000..b75b0cf
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p236-148.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p269-040.png b/demo/VCTK_v0.2.1/900000/p269-040.png
new file mode 100644
index 0000000..a8871cb
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p269-040.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p269-040.wav b/demo/VCTK_v0.2.1/900000/p269-040.wav
new file mode 100644
index 0000000..cc1b7b9
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p269-040.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p285-400.png b/demo/VCTK_v0.2.1/900000/p285-400.png
new file mode 100644
index 0000000..f5fe2b4
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p285-400.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p285-400.wav b/demo/VCTK_v0.2.1/900000/p285-400.wav
new file mode 100644
index 0000000..1d44b67
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p285-400.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p304-027.png b/demo/VCTK_v0.2.1/900000/p304-027.png
new file mode 100644
index 0000000..a1b43ca
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p304-027.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p304-027.wav b/demo/VCTK_v0.2.1/900000/p304-027.wav
new file mode 100644
index 0000000..b4953f6
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p304-027.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p317-019.png b/demo/VCTK_v0.2.1/900000/p317-019.png
new file mode 100644
index 0000000..3718594
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p317-019.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p317-019.wav b/demo/VCTK_v0.2.1/900000/p317-019.wav
new file mode 100644
index 0000000..f2d39c7
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p317-019.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p334-182.png b/demo/VCTK_v0.2.1/900000/p334-182.png
new file mode 100644
index 0000000..8afe2dd
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p334-182.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p334-182.wav b/demo/VCTK_v0.2.1/900000/p334-182.wav
new file mode 100644
index 0000000..b994f02
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p334-182.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p345-158.png b/demo/VCTK_v0.2.1/900000/p345-158.png
new file mode 100644
index 0000000..53999db
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p345-158.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p345-158.wav b/demo/VCTK_v0.2.1/900000/p345-158.wav
new file mode 100644
index 0000000..b19cef1
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p345-158.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/p361-227.png b/demo/VCTK_v0.2.1/900000/p361-227.png
new file mode 100644
index 0000000..b384cdd
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p361-227.png differ
diff --git a/demo/VCTK_v0.2.1/900000/p361-227.wav b/demo/VCTK_v0.2.1/900000/p361-227.wav
new file mode 100644
index 0000000..5cdb0de
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/p361-227.wav differ
diff --git a/demo/VCTK_v0.2.1/900000/s5-360.png b/demo/VCTK_v0.2.1/900000/s5-360.png
new file mode 100644
index 0000000..5045c8c
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/s5-360.png differ
diff --git a/demo/VCTK_v0.2.1/900000/s5-360.wav b/demo/VCTK_v0.2.1/900000/s5-360.wav
new file mode 100644
index 0000000..2accd86
Binary files /dev/null and b/demo/VCTK_v0.2.1/900000/s5-360.wav differ
diff --git a/img/tensorboard_audio_ljs.png b/img/tensorboard_audio_ljs.png
index 1fcad0e..92eae03 100644
Binary files a/img/tensorboard_audio_ljs.png and b/img/tensorboard_audio_ljs.png differ
diff --git a/img/tensorboard_audio_vctk.png b/img/tensorboard_audio_vctk.png
index 067f51e..7275f1a 100644
Binary files a/img/tensorboard_audio_vctk.png and b/img/tensorboard_audio_vctk.png differ
diff --git a/img/tensorboard_loss_ljs.png b/img/tensorboard_loss_ljs.png
index d1c1b24..6d72946 100644
Binary files a/img/tensorboard_loss_ljs.png and b/img/tensorboard_loss_ljs.png differ
diff --git a/img/tensorboard_loss_ljs_comparison.png b/img/tensorboard_loss_ljs_comparison.png
new file mode 100644
index 0000000..e88acf1
Binary files /dev/null and b/img/tensorboard_loss_ljs_comparison.png differ
diff --git a/img/tensorboard_loss_vctk.png b/img/tensorboard_loss_vctk.png
index 2b492c9..c0bfd06 100644
Binary files a/img/tensorboard_loss_vctk.png and b/img/tensorboard_loss_vctk.png differ
diff --git a/img/tensorboard_spec_ljs.png b/img/tensorboard_spec_ljs.png
index cbfbf78..1360024 100644
Binary files a/img/tensorboard_spec_ljs.png and b/img/tensorboard_spec_ljs.png differ
diff --git a/img/tensorboard_spec_vctk.png b/img/tensorboard_spec_vctk.png
index 1c1fb96..c08bbbf 100644
Binary files a/img/tensorboard_spec_vctk.png and b/img/tensorboard_spec_vctk.png differ
diff --git a/model/modules.py b/model/modules.py
index 3432da6..1bb35c5 100644
--- a/model/modules.py
+++ b/model/modules.py
@@ -12,6 +12,7 @@ from utils.tools import (
     get_variance_level,
+    get_phoneme_level_pitch,
     get_phoneme_level_energy,
     get_mask_from_lengths,
     pad_1D,
@@ -870,6 +871,14 @@ def binarize_attention_parallel(self, attn, in_lens, out_lens):
        attn_out = b_mas(attn_cpu, in_lens.cpu().numpy(), out_lens.cpu().numpy(), width=1)
        return torch.from_numpy(attn_out).to(attn.device)
 
+    def get_phoneme_level_pitch(self, phone, src_len, mel2ph, mel_len, pitch_frame):
+        return torch.from_numpy(
+            pad_1D(
+                [get_phoneme_level_pitch(ph[:s_len], m2ph[:m_len], var[:m_len]) for ph, s_len, m2ph, m_len, var \
+                    in zip(phone.int().cpu().numpy(), src_len.cpu().numpy(), mel2ph.cpu().numpy(), mel_len.cpu().numpy(), pitch_frame.cpu().numpy())]
+            )
+        ).float().to(pitch_frame.device)
+
     def get_phoneme_level_energy(self, duration, src_len, energy_frame):
         return torch.from_numpy(
             pad_1D(
@@ -972,7 +981,7 @@ def forward(
     ):
         pitch_prediction = energy_prediction = prosody_info = None
 
-        x = text
+        x = text.clone()
         if speaker_embedding is not None:
             x = x + speaker_embedding.unsqueeze(1).expand(
                 -1, text.shape[1], -1
@@ -1032,17 +1041,8 @@ def forward(
             attn_hard_dur = attn_hard.sum(2)[:, 0, :]
             attn_out = (attn_soft, attn_hard, attn_hard_dur, attn_logprob)
 
-        # Note that there is no pre-extracted phoneme-level variance features in unsupervised duration modeling.
-        # Alternatively, we can use attn_hard_dur instead of duration_target for computing phoneme-level variances.
-        output_1 = x.clone()
-        if self.use_energy_embed and self.energy_feature_level == "phoneme_level":
-            if attn_prior is not None:
-                energy_target = self.get_phoneme_level_energy(attn_hard_dur, src_len, energy_target)
-            energy_prediction, energy_embedding = self.get_energy_embedding(x, energy_target, src_mask, e_control)
-            output_1 = output_1 + energy_embedding
-        x = output_1.clone()
-
         # Upsampling from src length to mel length
+        x_org = x.clone()
         if attn_prior is not None: # Training of unsupervised duration modeling
             if step < self.binarization_start_steps:
                 A_soft = attn_soft.squeeze(1)
@@ -1065,7 +1065,9 @@ def forward(
             mel_mask = get_mask_from_lengths(mel_len)
             mel2ph = dur_to_mel2ph(duration_rounded, src_mask)
 
-        output_2 = x.clone()
+        # Note that there is no pre-extracted phoneme-level variance features in unsupervised duration modeling.
+        # Alternatively, we can use attn_hard_dur instead of duration_target for computing phoneme-level variances.
+        x_temp = x.clone()
         if self.use_pitch_embed:
             if pitch_target is not None:
                 mel2ph = pitch_target["mel2ph"]
@@ -1077,18 +1079,25 @@ def forward(
                         cwt_spec, f0_mean, f0_std, mel2ph,
                         self.preprocess_config["preprocessing"]["pitch"],
                     )
                     pitch_target.update({"f0_cwt": pitch_target["f0"]})
+                if self.pitch_type == "ph":
+                    pitch_target["f0"] = self.get_phoneme_level_pitch(text, src_len, mel2ph, mel_len, pitch_target["f0"])
                 pitch_prediction, pitch_embedding = self.get_pitch_embedding(
-                    x, pitch_target["f0"], pitch_target["uv"], mel2ph, p_control, encoder_out=output_1
+                    x, pitch_target["f0"], pitch_target["uv"], mel2ph, p_control, encoder_out=x_org
                 )
             else:
                 pitch_prediction, pitch_embedding = self.get_pitch_embedding(
-                    x, None, None, mel2ph, p_control, encoder_out=output_1
+                    x, None, None, mel2ph, p_control, encoder_out=x_org
                 )
-            output_2 = output_2 + pitch_embedding
+            x_temp = x_temp + pitch_embedding
         if self.use_energy_embed and self.energy_feature_level == "frame_level":
             energy_prediction, energy_embedding = self.get_energy_embedding(x, energy_target, mel_mask, e_control)
-            output_2 = output_2 + energy_embedding
-        x = output_2.clone()
+            x_temp = x_temp + energy_embedding
+        elif self.use_energy_embed and self.energy_feature_level == "phoneme_level":
+            if attn_prior is not None:
+                energy_target = self.get_phoneme_level_energy(attn_hard_dur, src_len, energy_target)
+            energy_prediction, energy_embedding = self.get_energy_embedding(x_org, energy_target, src_mask, e_control)
+            x_temp = x_temp + self.length_regulator(energy_embedding, duration_rounded, max_len)[0]
+        x = x_temp.clone()
 
         return (
             x,
diff --git a/utils/tools.py b/utils/tools.py
index 5305221..1d36386 100644
--- a/utils/tools.py
+++ b/utils/tools.py
@@ -44,6 +44,15 @@ def get_variance_level(preprocess_config, model_config, data_loading=True):
     return energy_level_tag, energy_feature_level
 
 
+def get_phoneme_level_pitch(phone, mel2ph, pitch):
+    pitch_phlevel_sum = torch.zeros(phone.shape[:-1]).float().scatter_add(
+        0, torch.from_numpy(mel2ph).long() - 1, torch.from_numpy(pitch).float())
+    pitch_phlevel_num = torch.zeros(phone.shape[:-1]).float().scatter_add(
+        0, torch.from_numpy(mel2ph).long() - 1, torch.ones(pitch.shape)).clamp_min(1)
+    pitch = (pitch_phlevel_sum / pitch_phlevel_num).numpy()
+    return pitch
+
+
 def get_phoneme_level_energy(duration, energy):
     # Phoneme-level average
     pos = 0
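As a quick sanity check of the phoneme-level averaging that `get_phoneme_level_pitch` performs (the same computation previously done per sample in `dataset.py` and now applied inside the variance adaptor), here is a minimal, self-contained sketch with made-up toy shapes; it is not part of the diff.

```python
# Toy check of the phoneme-level averaging added in utils/tools.py above.
# Shapes follow the call site in VarianceAdaptor.get_phoneme_level_pitch:
# `phone` is a (src_len, hidden) encoder slice, `mel2ph` maps each mel frame
# to a 1-based phoneme index, and `pitch` is the frame-level f0.
import numpy as np
import torch

def get_phoneme_level_pitch(phone, mel2ph, pitch):  # as introduced in the diff
    pitch_phlevel_sum = torch.zeros(phone.shape[:-1]).float().scatter_add(
        0, torch.from_numpy(mel2ph).long() - 1, torch.from_numpy(pitch).float())
    pitch_phlevel_num = torch.zeros(phone.shape[:-1]).float().scatter_add(
        0, torch.from_numpy(mel2ph).long() - 1, torch.ones(pitch.shape)).clamp_min(1)
    return (pitch_phlevel_sum / pitch_phlevel_num).numpy()

phone = np.zeros((3, 256), dtype=np.float32)            # 3 phonemes, hidden size 256
mel2ph = np.array([1, 1, 2, 2, 2, 3], dtype=np.int64)   # 6 mel frames -> phoneme indices
f0 = np.array([100., 110., 200., 210., 220., 0.], dtype=np.float32)
print(get_phoneme_level_pitch(phone, mel2ph, f0))       # per-phoneme means: 105.0, 210.0, 0.0
```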