
Commit

Initial commit
huutuongtu committed May 10, 2024
0 parents commit ffd835e
Showing 21 changed files with 14,666 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
12,950 changes: 12,950 additions & 0 deletions LJSpeech-1.1/training.txt

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions LJSpeech-1.1/validation.txt

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions README.md
@@ -0,0 +1,22 @@
# LIGHTVOC: AN UPSAMPLING-FREE GAN VOCODER BASED ON CONFORMER AND INVERSE SHORT-TIME FOURIER TRANSFORM
This repo tries to implement [LightVoc: An Upsampling-Free GAN Vocoder Based on Conformer and Inverse Short-Time Fourier Transform](https://api.semanticscholar.org/CorpusID:260916503).

## Training:
```
python train_lightvoc.py --config config_v1.json
```

## Citations:
```
@article{Dang2023LightVocAU,
title={LightVoc: An Upsampling-Free GAN Vocoder Based On Conformer And Inverse Short-time Fourier Transform},
author={Dinh Son Dang and Tung Lam Nguyen and Bao Thang Ta and Tien Thanh Nguyen and Thi Ngoc Anh Nguyen and Dan Linh Le and Nhat Minh Le and Van Hai Do},
journal={INTERSPEECH 2023},
year={2023},
url={https://api.semanticscholar.org/CorpusID:260916503}
}
```

## References:
* https://github.com/jik876/hifi-gan
* https://github.com/rishikksh20/iSTFTNet-pytorch
38 changes: 38 additions & 0 deletions config_v1.json
@@ -0,0 +1,38 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,

"upsample_rates": [8,8],
"upsample_kernel_sizes": [16,16],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"gen_istft_n_fft": 1024,
"gen_istft_hop_size": 256,

"segment_size": 8192,
"num_mels": 80,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,

"sampling_rate": 22050,

"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,

"num_workers": 4,

"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:6000",
"world_size": 1
}
}
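
The training command in the README presumably parses this file before building the models. Below is a minimal loading sketch using only the standard library; the variable name `h` follows the HiFi-GAN convention and is an assumption, not code taken from `train_lightvoc.py`.

```
import json

# Sketch only: load config_v1.json and read the analysis/synthesis settings.
# Key names match the JSON above; everything else here is illustrative.
with open("config_v1.json") as f:
    h = json.load(f)

# Mel-spectrogram analysis side.
print(h["sampling_rate"], h["n_fft"], h["hop_size"], h["num_mels"])  # 22050 1024 256 80

# Inverse-STFT synthesis side of the generator.
print(h["gen_istft_n_fft"], h["gen_istft_hop_size"])                 # 1024 256
```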
203 changes: 203 additions & 0 deletions conformer.py
@@ -0,0 +1,203 @@
from typing import Optional, Tuple

import torch

"""
From pytorch implementation
"""

__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask
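# Illustrative example (not part of the original file):
# _lengths_to_padding_mask(torch.tensor([2, 4])) returns
#     tensor([[False, False,  True,  True],
#             [False, False, False, False]])
# i.e. True marks frames beyond each sequence's valid length.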


class _ConvolutionModule(torch.nn.Module):

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        if (depthwise_kernel_size - 1) % 2 != 0:
            raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.")
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(
                input_dim,
                2 * num_channels,
                1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.GLU(dim=1),
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(
                num_channels,
                input_dim,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        x = self.layer_norm(input)
        x = x.transpose(1, 2)
        x = self.sequential(x)
        return x.transpose(1, 2)


class _FeedForwardModule(torch.nn.Module):

    def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None:
        super().__init__()
        self.sequential = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, hidden_dim, bias=True),
            torch.nn.SiLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, input_dim, bias=True),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return self.sequential(input)


class ConformerLayer(torch.nn.Module):

    def __init__(
        self,
        input_dim: int,
        ffn_dim: int,
        num_attention_heads: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ) -> None:
        super().__init__()

        self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)

        self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim)
        self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout)
        self.self_attn_dropout = torch.nn.Dropout(dropout)

        self.conv_module = _ConvolutionModule(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            bias=True,
            use_group_norm=use_group_norm,
        )

        self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.final_layer_norm = torch.nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor:
        # The convolution module works on (batch, time, dim); the layer itself runs on (time, batch, dim).
        residual = input
        input = input.transpose(0, 1)
        input = self.conv_module(input)
        input = input.transpose(0, 1)
        input = residual + input
        return input

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Macaron-style block: half-step FFN, (optional conv), self-attention,
        # (conv), half-step FFN, final layer norm, all with residual connections.
        residual = input
        x = self.ffn1(input)
        x = x * 0.5 + residual

        if self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            need_weights=False,
        )
        x = self.self_attn_dropout(x)
        x = x + residual

        if not self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.ffn2(x)
        x = x * 0.5 + residual

        x = self.final_layer_norm(x)
        return x

class Conformer(torch.nn.Module):

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()

        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, input: torch.Tensor):
        # Input is (batch, time, dim); layers run on (time, batch, dim).
        # Returns the output of every layer, each transposed back to (batch, time, dim).
        x = input.transpose(0, 1)
        out = []
        for layer in self.conformer_layers:
            x = layer(x)
            out.append(x.transpose(0, 1))
        return out
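
A hedged usage sketch of the `Conformer` stack defined above: `input_dim=80` is suggested by `num_mels` in `config_v1.json`, while the remaining hyperparameters are illustrative assumptions rather than the values used by `train_lightvoc.py`. Note that `forward` returns a list with one output tensor per layer.

```
import torch
from conformer import Conformer

# Illustrative hyperparameters (assumptions), except input_dim=80 (= num_mels).
model = Conformer(
    input_dim=80,
    num_heads=4,
    ffn_dim=256,
    num_layers=2,
    depthwise_conv_kernel_size=31,
)

x = torch.randn(2, 100, 80)             # (batch, frames, mel channels)
outputs = model(x)                      # one tensor per ConformerLayer
print(len(outputs), outputs[-1].shape)  # 2 torch.Size([2, 100, 80])
```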
15 changes: 15 additions & 0 deletions env.py
@@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
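
A short sketch of how `AttrDict` and `build_env` are typically used together in HiFi-GAN-derived training scripts; the checkpoint directory name and variable names below are illustrative assumptions.

```
import json

# Sketch only: the real wiring lives in train_lightvoc.py.
config_path = "config_v1.json"
with open(config_path) as f:
    h = AttrDict(json.load(f))

print(h.batch_size, h.learning_rate)  # attribute-style access: 16 0.0002

# Copy the config next to the checkpoints so the run stays reproducible.
build_env(config_path, "config_v1.json", "cp_lightvoc")
```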