Commit ffd835e (0 parents): 21 changed files with 14,666 additions and 0 deletions.
```
# Auto detect text files and perform LF normalization
* text=auto
```
# LIGHTVOC: AN UPSAMPLING-FREE GAN VOCODER BASED ON CONFORMER AND INVERSE SHORT-TIME FOURIER TRANSFORM

This repo tries to implement [LightVoc: An Upsampling-Free GAN Vocoder Based on Conformer and Inverse Short-Time Fourier Transform](https://api.semanticscholar.org/CorpusID:260916503) (INTERSPEECH 2023).

## Training

```
python train_lightvoc.py --config config_v1.json
```

## Citations

```
@article{Dang2023LightVocAU,
  title={LightVoc: An Upsampling-Free GAN Vocoder Based On Conformer And Inverse Short-time Fourier Transform},
  author={Dinh Son Dang and Tung Lam Nguyen and Bao Thang Ta and Tien Thanh Nguyen and Thi Ngoc Anh Nguyen and Dan Linh Le and Nhat Minh Le and Van Hai Do},
  journal={INTERSPEECH 2023},
  year={2023},
  url={https://api.semanticscholar.org/CorpusID:260916503}
}
```

## References

* https://github.com/jik876/hifi-gan
* https://github.com/rishikksh20/iSTFTNet-pytorch
```
{
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8],
    "upsample_kernel_sizes": [16,16],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "gen_istft_n_fft": 1024,
    "gen_istft_hop_size": 256,

    "segment_size": 8192,
    "num_mels": 80,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": null,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:6000",
        "world_size": 1
    }
}
```
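As a quick sanity check (not part of the repository), the analysis and generation settings above fit together as follows: in this config the generator's iSTFT parameters (`gen_istft_n_fft`, `gen_istft_hop_size`) coincide with the mel-analysis FFT and hop size, and each training segment of `segment_size` samples corresponds to `segment_size / hop_size` mel frames. The `config_v1.json` filename is taken from the README's training command.

```
import json

# Sketch only (not repository code): load the config above and check how the
# analysis and iSTFT-generation settings relate to each other.
with open("config_v1.json") as f:   # filename taken from the README's training command
    h = json.load(f)

frames_per_segment = h["segment_size"] // h["hop_size"]   # 8192 // 256 = 32 mel frames per segment
assert h["gen_istft_n_fft"] == h["n_fft"]                  # 1024: generator iSTFT matches the analysis FFT
assert h["gen_istft_hop_size"] == h["hop_size"]            # 256: and the same hop size
print(frames_per_segment)
```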
```
from typing import Optional, Tuple

import torch

"""
From pytorch implementation
"""

__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    # Build a (batch, max_length) boolean mask that is True at padded positions.
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


class _ConvolutionModule(torch.nn.Module):
    """Conformer convolution module: pointwise conv + GLU, depthwise conv, norm, SiLU, pointwise conv."""

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        if (depthwise_kernel_size - 1) % 2 != 0:
            raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.")
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(
                input_dim,
                2 * num_channels,
                1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.GLU(dim=1),
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(
                num_channels,
                input_dim,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # (batch, time, dim) -> (batch, dim, time) for Conv1d, then back.
        x = self.layer_norm(input)
        x = x.transpose(1, 2)
        x = self.sequential(x)
        return x.transpose(1, 2)


class _FeedForwardModule(torch.nn.Module):
    """Position-wise feed-forward module with SiLU activation."""

    def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None:
        super().__init__()
        self.sequential = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, hidden_dim, bias=True),
            torch.nn.SiLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, input_dim, bias=True),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return self.sequential(input)


class ConformerLayer(torch.nn.Module):
    """One Conformer block (half-step FFN, self-attention, convolution, half-step FFN);
    the order of attention and convolution is controlled by ``convolution_first``.
    Operates on (time, batch, input_dim) tensors."""

    def __init__(
        self,
        input_dim: int,
        ffn_dim: int,
        num_attention_heads: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ) -> None:
        super().__init__()

        self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)

        self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim)
        self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout)
        self.self_attn_dropout = torch.nn.Dropout(dropout)

        self.conv_module = _ConvolutionModule(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            bias=True,
            use_group_norm=use_group_norm,
        )

        self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.final_layer_norm = torch.nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor:
        residual = input
        input = input.transpose(0, 1)
        input = self.conv_module(input)
        input = input.transpose(0, 1)
        input = residual + input
        return input

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        residual = input
        x = self.ffn1(input)
        x = x * 0.5 + residual

        if self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            need_weights=False,
        )
        x = self.self_attn_dropout(x)
        x = x + residual

        if not self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.ffn2(x)
        x = x * 0.5 + residual

        x = self.final_layer_norm(x)
        return x


class Conformer(torch.nn.Module):
    """Stack of Conformer layers. ``forward`` takes a (batch, time, input_dim) tensor and
    returns a list with the output of every layer, each of shape (batch, time, input_dim)."""

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()

        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, input: torch.Tensor):
        # Layers work in (time, batch, dim); outputs are transposed back to (batch, time, dim).
        x = input.transpose(0, 1)
        out = []
        for layer in self.conformer_layers:
            x = layer(x)
            out.append(x.transpose(0, 1))
        return out
```
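For orientation, here is a small usage sketch (not from the repository) that runs the `Conformer` stack above on a dummy batch of mel-like features; the hyperparameter values are illustrative only.

```
import torch

# Illustrative only: a tiny Conformer on a dummy (batch, frames, features) tensor.
model = Conformer(
    input_dim=80,                   # e.g. number of mel bins (matches num_mels in the config)
    num_heads=4,                    # input_dim must be divisible by num_heads
    ffn_dim=256,
    num_layers=2,
    depthwise_conv_kernel_size=31,  # must be odd ('SAME' padding check in _ConvolutionModule)
)
x = torch.randn(2, 32, 80)          # (batch, frames, input_dim)
outs = model(x)                     # list with one output per layer
print(len(outs), outs[-1].shape)    # 2 torch.Size([2, 32, 80])
```

Note that `forward` returns the output of every layer rather than only the last one, which makes it easy to tap intermediate representations.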
```
import os
import shutil


class AttrDict(dict):
    """Dictionary whose keys are also accessible as attributes (e.g. h.batch_size)."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    """Copy the config file into the run directory so each run keeps its own copy."""
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
```
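A hypothetical usage sketch, mirroring how HiFi-GAN-style training scripts typically use these helpers (the checkpoint directory name below is made up): load the JSON config into an `AttrDict` for attribute-style access and copy it into the run directory.

```
import json

# Hypothetical usage, not repository code.
with open("config_v1.json") as f:             # filename taken from the README's training command
    h = AttrDict(json.loads(f.read()))

print(h.batch_size, h.sampling_rate)           # attribute access thanks to AttrDict

build_env("config_v1.json", "config.json", "cp_lightvoc")  # "cp_lightvoc" is an assumed directory name
```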