Commit ffd835e (0 parents): 21 changed files with 14,666 additions and 0 deletions.
```
# Auto detect text files and perform LF normalization
* text=auto
```
# LIGHTVOC: AN UPSAMPLING-FREE GAN VOCODER BASED ON CONFORMER AND INVERSE SHORT-TIME FOURIER TRANSFORM

This repo tries to implement [LightVoc: An Upsampling-Free GAN Vocoder Based on Conformer and Inverse Short-Time Fourier Transform](https://api.semanticscholar.org/CorpusID:260916503) (INTERSPEECH 2023).

## Training

```
python train_lightvoc.py --config config_v1.json
```

## Citations

```
@article{Dang2023LightVocAU,
  title={LightVoc: An Upsampling-Free GAN Vocoder Based On Conformer And Inverse Short-time Fourier Transform},
  author={Dinh Son Dang and Tung Lam Nguyen and Bao Thang Ta and Tien Thanh Nguyen and Thi Ngoc Anh Nguyen and Dan Linh Le and Nhat Minh Le and Van Hai Do},
  journal={INTERSPEECH 2023},
  year={2023},
  url={https://api.semanticscholar.org/CorpusID:260916503}
}
```

## References

* https://github.com/jik876/hifi-gan
* https://github.com/rishikksh20/iSTFTNet-pytorch
```
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8],
    "upsample_kernel_sizes": [16,16],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "gen_istft_n_fft": 1024,
    "gen_istft_hop_size": 256,

    "segment_size": 8192,
    "num_mels": 80,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:6000",
        "world_size": 1
    }
}
```
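As a quick sanity check (not part of the repository), the analysis and generation settings above fit together as follows: in this config the generator's iSTFT parameters (`gen_istft_n_fft`, `gen_istft_hop_size`) coincide with the mel-analysis FFT and hop size, and each training segment of `segment_size` samples corresponds to `segment_size / hop_size` mel frames. The `config_v1.json` filename is taken from the README's training command.

```
import json

# Sketch only (not repository code): load the config above and check how the
# analysis and iSTFT-generation settings relate to each other.
with open("config_v1.json") as f:   # filename taken from the README's training command
    h = json.load(f)

frames_per_segment = h["segment_size"] // h["hop_size"]   # 8192 // 256 = 32 mel frames per segment
assert h["gen_istft_n_fft"] == h["n_fft"]                  # 1024: generator iSTFT matches the analysis FFT
assert h["gen_istft_hop_size"] == h["hop_size"]            # 256: and the same hop size
print(frames_per_segment)
```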
```
from typing import Optional, Tuple

import torch

"""
From pytorch implementation
"""

__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    # Build a (batch, max_length) boolean mask that is True at padded positions.
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


class _ConvolutionModule(torch.nn.Module):
    """Conformer convolution module: pointwise conv + GLU, depthwise conv, norm, SiLU, pointwise conv."""

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        if (depthwise_kernel_size - 1) % 2 != 0:
            raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.")
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(
                input_dim,
                2 * num_channels,
                1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.GLU(dim=1),
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(
                num_channels,
                input_dim,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # (batch, time, dim) -> (batch, dim, time) for Conv1d, then back.
        x = self.layer_norm(input)
        x = x.transpose(1, 2)
        x = self.sequential(x)
        return x.transpose(1, 2)


class _FeedForwardModule(torch.nn.Module):
    """Position-wise feed-forward module with SiLU activation."""

    def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None:
        super().__init__()
        self.sequential = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, hidden_dim, bias=True),
            torch.nn.SiLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, input_dim, bias=True),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return self.sequential(input)


class ConformerLayer(torch.nn.Module):
    """One Conformer block (half-step FFN, self-attention, convolution, half-step FFN);
    the order of attention and convolution is controlled by ``convolution_first``.
    Operates on (time, batch, input_dim) tensors."""

    def __init__(
        self,
        input_dim: int,
        ffn_dim: int,
        num_attention_heads: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ) -> None:
        super().__init__()

        self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)

        self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim)
        self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout)
        self.self_attn_dropout = torch.nn.Dropout(dropout)

        self.conv_module = _ConvolutionModule(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            bias=True,
            use_group_norm=use_group_norm,
        )

        self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.final_layer_norm = torch.nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor:
        residual = input
        input = input.transpose(0, 1)
        input = self.conv_module(input)
        input = input.transpose(0, 1)
        input = residual + input
        return input

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        residual = input
        x = self.ffn1(input)
        x = x * 0.5 + residual

        if self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            need_weights=False,
        )
        x = self.self_attn_dropout(x)
        x = x + residual

        if not self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.ffn2(x)
        x = x * 0.5 + residual

        x = self.final_layer_norm(x)
        return x


class Conformer(torch.nn.Module):
    """Stack of Conformer layers. ``forward`` takes a (batch, time, input_dim) tensor and
    returns a list with the output of every layer, each of shape (batch, time, input_dim)."""

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()

        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, input: torch.Tensor):
        # Layers work in (time, batch, dim); outputs are transposed back to (batch, time, dim).
        x = input.transpose(0, 1)
        out = []
        for layer in self.conformer_layers:
            x = layer(x)
            out.append(x.transpose(0, 1))
        return out
```
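For orientation, here is a small usage sketch (not from the repository) that runs the `Conformer` stack above on a dummy batch of mel-like features; the hyperparameter values are illustrative only.

```
import torch

# Illustrative only: a tiny Conformer on a dummy (batch, frames, features) tensor.
model = Conformer(
    input_dim=80,                   # e.g. number of mel bins (matches num_mels in the config)
    num_heads=4,                    # input_dim must be divisible by num_heads
    ffn_dim=256,
    num_layers=2,
    depthwise_conv_kernel_size=31,  # must be odd ('SAME' padding check in _ConvolutionModule)
)
x = torch.randn(2, 32, 80)          # (batch, frames, input_dim)
outs = model(x)                     # list with one output per layer
print(len(outs), outs[-1].shape)    # 2 torch.Size([2, 32, 80])
```

Note that `forward` returns the output of every layer rather than only the last one, which makes it easy to tap intermediate representations.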
```
import os
import shutil


class AttrDict(dict):
    """Dictionary whose keys are also accessible as attributes (e.g. h.batch_size)."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    """Copy the config file into the run directory so each run keeps its own copy."""
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
```
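A hypothetical usage sketch, mirroring how HiFi-GAN-style training scripts typically use these helpers (the checkpoint directory name below is made up): load the JSON config into an `AttrDict` for attribute-style access and copy it into the run directory.

```
import json

# Hypothetical usage, not repository code.
with open("config_v1.json") as f:             # filename taken from the README's training command
    h = AttrDict(json.loads(f.read()))

print(h.batch_size, h.sampling_rate)           # attribute access thanks to AttrDict

build_env("config_v1.json", "config.json", "cp_lightvoc")  # "cp_lightvoc" is an assumed directory name
```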