diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6769e21
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/README.md b/README.md
index bdbd2d9..2266a35 100644
--- a/README.md
+++ b/README.md
@@ -11,14 +11,17 @@ from various objective and subjective evaluation metrics.
 Sound demos can be fou
 
 ## Datasets
 
-- [Microsoft DNS 2020](https://arxiv.org/ftp/arxiv/papers/2005/2005.13981.pdf) dataset. The dataset, pre-processing codes, and instruction to generate training data can be found in [this link](https://github.com/microsoft/DNS-Challenge/tree/interspeech2020/master). Assume the dataset is stored under ```./dns```. Before generating clean-noisy data pairs, modify the following parameters in their ```noisyspeech_synthesizer.cfg``` file:
+- [Microsoft DNS 2020](https://arxiv.org/ftp/arxiv/papers/2005/2005.13981.pdf) dataset. The dataset, pre-processing code, and instructions to generate training data can be found at [this link](https://github.com/microsoft/DNS-Challenge/tree/interspeech2020/master). Assume the dataset is stored under ``./dns``. Before generating clean-noisy data pairs, modify the following parameters in their ``noisyspeech_synthesizer.cfg`` file:
+
 ```
 total_hours: 500,
 snr_lower: -5,
 snr_upper: 25,
 total_snrlevels: 31
 ```
+
 Also update the paths as follows (their original code uses Windows-style paths):
+
 ```
 noise_dir: ./datasets/noise
 speech_dir: ./datasets/clean
@@ -28,23 +31,30 @@ noise_destination: ./training_set/noise
 log_dir: ./logs
 unit_tests_log_dir: ./unittests_logs
 ```
-Then, for conciseness and to comply with our data loading codes, modify file names (lines 198-201) in their ```noisyspeech_synthesizer_singleprocess.py``` to
+
+Then, for conciseness and to comply with our data loading code, modify the file names (lines 198-201) in their ``noisyspeech_synthesizer_singleprocess.py`` to
+
 ```
 noisyfilename = 'fileid_' + str(file_num) + '.wav'
 cleanfilename = 'fileid_' + str(file_num) + '.wav'
 noisefilename = 'fileid_' + str(file_num) + '.wav'
 ```
-To generate training data, run
+
+To generate the training data, run
+
 ```
 python noisyspeech_synthesizer_singleprocess.py
 ```
+
 It is also recommended to rename files in the test set for conciseness:
+
 ```
 cd ./dns/datasets/test_set/synthetic/no_reverb/noisy/
 for NAME in $(ls ./); do arr=(${NAME//fileid_/ }); mv ${NAME} noisy_fileid_${arr[1]}; done
 ```
 
 After these steps, we assume that the structure of the dataset folder is:
+
 ```
 Training sets:
 ./dns/training_set/clean/fileid_{0..59999}.wav
@@ -56,35 +66,43 @@ Testing sets (no-reverb):
 ./dns/datasets/test_set/synthetic/no_reverb/noisy/noisy_fileid_{0..299}.wav
 ```
 
-- Other datasets are also supported; lines 49-50 of ```dataset.py``` need to be carefully changed to handle paths and file names.
+- Other datasets are also supported; lines 49-50 of ``dataset.py`` need to be carefully changed to handle paths and file names.
 
 ## Training
 
-The ```$EXP``` variable can be any config name in ```./configs/```, such as ```DNS-large-full``` and ```DNS-large-high```. The default experiment path is ```./exp```; it can be changed by modifying ```train_config[log[directory]]``` in the config files. ```trainset_config[root]``` needs to be set as the root path of the dataset. Then, the training code is
+The ``$EXP`` variable can be any config name in ``./configs/``, such as ``DNS-large-full`` and ``DNS-large-high``. The default experiment path is ``./exp``; it can be changed by modifying ``train_config[log[directory]]`` in the config files. ``trainset_config[root]`` needs to be set to the root path of the dataset (a sketch of these keys is shown below). Then, the training command is
 
-```python3 distributed.py -c configs/${EXP}.json```
+``python3 distributed.py -c configs/${EXP}.json``
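+
+For reference, the config keys mentioned above live in the config JSON roughly as sketched below (a trimmed, partly hypothetical excerpt; see the actual files in ``./configs/`` for the full set of fields):
+
+```
+{
+    "train_config": {
+        "log": {"directory": "./exp"}
+    },
+    "trainset_config": {
+        "root": "./dns"
+    }
+}
+```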
 
 We use 8 GPUs for training. The global batch size is 64 and we train the models for 250K iterations.
 Note that this differs from the training setup in our paper (1M iterations with a batch size of 16). We find a negligible difference in objective and subjective evaluations, but the current setup is faster.
 
-**Pre-trained** models for denoising are provided in ```./exp/${EXP}/checkpoint/pretrained.pkl``` (each one has size ~177Mb; use ```git lfs``` to download). Note that these models are not trained to remove reverb.
+**Pre-trained** models for denoising are provided in ``./exp/${EXP}/checkpoint/pretrained.pkl`` (each one is ~177 MB; use ``git lfs`` to download them). Note that these models are not trained to remove reverb.
 
 ## Denoising
 
-We perform denoising on the DNS no-reverb test dataset. The output path is ```gen_config[output_directory]```, which is ```./exp``` by default. The denoising code is
+We perform denoising on the DNS no-reverb test dataset. The output path is ``gen_config[output_directory]``, which is ``./exp`` by default. The denoising command is
 
-```python denoise.py -c configs/${EXP}.json --ckpt_iter ${ITERATION}```
+``python denoise.py -c configs/${EXP}.json --ckpt_iter ${ITERATION}``
 
 For example, if you want to use pre-trained models to denoise, run:
 
-```python denoise.py -c configs/DNS-large-high.json --ckpt_iter pretrained```
+``python denoise.py -c configs/DNS-large-high.json --ckpt_iter pretrained``
 
 1 GPU is used for denoising.
 
+## Simple Denoising without a Dataset
+
+The script ``denoise_simple.py`` denoises a single file or multiple files without requiring the DNS dataset layout. It performs automatic fp16 conversion, and the batch size can be adjusted to fit your VRAM:
+
+``python denoise_simple.py -c configs/${EXP}.json --ckpt_path ${CKPT_PATH} -b 1000000``
+
+For example, with a pre-trained checkpoint:
+
+``python denoise_simple.py -c configs/DNS-large-high.json --ckpt_path ./exp/DNS-large-high/checkpoint/pretrained.pkl ./test.mp4``
+
 ## Evaluation
 
 The following evaluation code generates [PESQ](https://www.itu.int/rec/T-REC-P.862) and [STOI](https://ceestaal.nl/code/) scores. More evaluation metrics can be found in the [SEGAN (PyTorch)](https://github.com/santi-pdp/segan_pytorch) repo.
 
-```python python_eval.py -d dns -e ${PATH_TO_DENOISED_SPEECH} -t ${PATH_TO_TESTSET_PATH} >> eval.log```
+``python python_eval.py -d dns -e ${PATH_TO_DENOISED_SPEECH} -t ${PATH_TO_TESTSET_PATH} >> eval.log``
 
 1 GPU is used for evaluation.
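+
+If you want to score a single clean/denoised pair outside of ``python_eval.py``, the ``pesq`` and ``pystoi`` packages listed under Dependencies can be used directly. A minimal sketch, assuming 16 kHz mono WAV files (the file names below are placeholders):
+
+```
+from pesq import pesq
+from pystoi import stoi
+from scipy.io import wavfile
+
+rate, clean = wavfile.read("clean_fileid_0.wav")        # reference speech
+rate, enhanced = wavfile.read("enhanced_fileid_0.wav")  # denoised output
+
+print("PESQ (wideband):", pesq(rate, clean, enhanced, "wb"))
+print("STOI:", stoi(clean, enhanced, rate, extended=False))
+```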
@@ -92,13 +110,13 @@
 To synthesize [Microsoft DNS 2020](https://arxiv.org/ftp/arxiv/papers/2005/2005.13981.pdf) training data, you need [these dependencies](https://github.com/microsoft/DNS-Challenge/blob/interspeech2020/master/requirements.txt). If you just want to evaluate our pre-trained models on the test data, you may skip this step.
 
-Our code is tested on 8 NVIDIA V100 GPUs. You need to install very standard dependencies: ```numpy``` and ```scipy``` for scientific computing, ```torch, torchvision, torchaudio``` for deep learning and data loading, ```pesq, pystoi``` for audio evaluation, and ```tqdm``` for visualization.
+Our code is tested on 8 NVIDIA V100 GPUs. You need to install only standard dependencies: ``numpy`` and ``scipy`` for scientific computing; ``torch``, ``torchvision``, and ``torchaudio`` for deep learning and data loading; ``pesq`` and ``pystoi`` for audio evaluation; and ``tqdm`` for progress visualization.
 
 ## References
 
-The code structure and distributed training are adapted from [WaveGlow (PyTorch)](https://github.com/NVIDIA/waveglow) (BSD-3-Clause license). The ```stft_loss.py``` is adapted from [ParallelWaveGAN (PyTorch)](https://github.com/kan-bayashi/ParallelWaveGAN) (MIT license). The self-attention blocks in ```network.py``` is adapted from [Attention is all you need (PyTorch)](https://github.com/jadore801120/attention-is-all-you-need-pytorch) (MIT license), which borrows from [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py) (MIT license). The learning rate scheduler in ```util.py``` is adapted from [VQVAE2 (PyTorch)](https://github.com/rosinality/vq-vae-2-pytorch) (MIT license). Some utility functions are borrowed from [DiffWave (PyTorch)](https://github.com/philsyn/DiffWave-Vocoder) (MIT license) and [WaveGlow (PyTorch)](https://github.com/NVIDIA/waveglow) (BSD-3-Clause license).
+The code structure and distributed training are adapted from [WaveGlow (PyTorch)](https://github.com/NVIDIA/waveglow) (BSD-3-Clause license). ``stft_loss.py`` is adapted from [ParallelWaveGAN (PyTorch)](https://github.com/kan-bayashi/ParallelWaveGAN) (MIT license). The self-attention blocks in ``network.py`` are adapted from [Attention Is All You Need (PyTorch)](https://github.com/jadore801120/attention-is-all-you-need-pytorch) (MIT license), which borrows from [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py) (MIT license). The learning rate scheduler in ``util.py`` is adapted from [VQVAE2 (PyTorch)](https://github.com/rosinality/vq-vae-2-pytorch) (MIT license). Some utility functions are borrowed from [DiffWave (PyTorch)](https://github.com/philsyn/DiffWave-Vocoder) (MIT license) and [WaveGlow (PyTorch)](https://github.com/NVIDIA/waveglow) (BSD-3-Clause license).
 
-For more evaluation methods, we refer readers to look at [SEGAN (PyTorch)](https://github.com/santi-pdp/segan_pytorch/blob/master/segan/utils.py) (MIT license). For more data augmentation methods, we refer readers to look at [FAIR-denoiser](https://github.com/facebookresearch/denoiser/blob/main/denoiser/augment.py) (CC-BY-NC 4.0 license).
+For more evaluation methods, we refer readers to [SEGAN (PyTorch)](https://github.com/santi-pdp/segan_pytorch/blob/master/segan/utils.py) (MIT license). For more data augmentation methods, we refer readers to [FAIR-denoiser](https://github.com/facebookresearch/denoiser/blob/main/denoiser/augment.py) (CC-BY-NC 4.0 license).
 
 ## Citation
diff --git a/denoise_simple.py b/denoise_simple.py
new file mode 100644
index 0000000..8d50f0b
--- /dev/null
+++ b/denoise_simple.py
@@ -0,0 +1,147 @@
+# Adapted from https://github.com/NVIDIA/waveglow under the BSD 3-Clause License.
+
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import os
+import argparse
+import json
+
+import numpy as np
+import torch
+import torchaudio
+from tqdm import tqdm
+from scipy.io.wavfile import write as wavwrite
+
+import random
+random.seed(0)
+torch.manual_seed(0)
+np.random.seed(0)
+
+from util import print_size, sampling
+from network import CleanUNet
+
+
+def load_simple(filename):
+    # torchaudio returns (waveform, sample_rate); keep the sample rate so the
+    # denoised output can be written back at the input's rate
+    audio, sample_rate = torchaudio.load(filename)
+    return audio, sample_rate
+
+
+def denoise(files, ckpt_path, batch_size):
+    """
+    Denoise audio files with a pretrained CleanUNet checkpoint.
+
+    Parameters:
+    files (list):       paths of the noisy audio files to denoise
+    ckpt_path (str):    path of the pretrained checkpoint (.pkl)
+    batch_size (int):   approximate number of samples processed per forward pass
+    """
+
+    # setup local experiment path
+    exp_path = train_config["exp_path"]
+    print('exp_path:', exp_path)
+
+    # predefine model
+    net = CleanUNet(**network_config).cuda()
+    print_size(net)
+
+    # load checkpoint
+    checkpoint = torch.load(ckpt_path, map_location='cpu')
+    net.load_state_dict(checkpoint['model_state_dict'])
+    net.eval()
+
+    # inference
+    for file_path in tqdm(files):
+        file_name = os.path.basename(file_path)
+        file_dir = os.path.dirname(file_path)  # save next to the input file
+        new_file_name = os.path.splitext(file_name)[0] + "_denoised.wav"
+
+        noisy_audio, sample_rate = load_simple(file_path)
+        noisy_audio = noisy_audio.cuda()
+        length = len(noisy_audio[0].squeeze())
+        # split the waveform along time into roughly batch_size-sample chunks;
+        # torch.chunk equalizes the chunk lengths, hence "not exact"
+        noisy_audio = torch.chunk(noisy_audio, length // batch_size + 1, dim=1)
+        all_audio = []
+
+        for batch in tqdm(noisy_audio):
+            with torch.no_grad(), torch.cuda.amp.autocast():
+                generated_audio = sampling(net, batch)
+            # cast back to float32 so scipy's wav writer accepts the array
+            all_audio.append(generated_audio.float().cpu().numpy().squeeze())
+
+        all_audio = np.concatenate(all_audio, axis=0)
+        save_file = os.path.join(file_dir, new_file_name)
+        print("saved to:", save_file)
+        wavwrite(save_file, sample_rate, all_audio)
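+
+# Example (using the pre-trained checkpoint described in the README):
+#   python denoise_simple.py -c configs/DNS-large-high.json \
+#       --ckpt_path ./exp/DNS-large-high/checkpoint/pretrained.pkl noisy.wav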
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, default='config.json',
+                        help='JSON file for configuration')
+    parser.add_argument('-ckpt_path', '--ckpt_path',
+                        help='path to the checkpoint you want to use')
+    parser.add_argument('-b', '--batch_size', type=int, default=100_000,
+                        help='approximate chunk length (in samples) used to split '
+                             'the input audio; the split is not exact')
+    parser.add_argument('files', nargs=argparse.REMAINDER,
+                        help='audio files to denoise')
+    args = parser.parse_args()
+
+    # Parse configs. Globals are simpler in this case.
+    with open(args.config) as f:
+        config = json.loads(f.read())
+    network_config = config["network_config"]    # to define CleanUNet
+    train_config = config["train_config"]        # training configuration
+    trainset_config = config["trainset_config"]  # trainset configuration
+
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.benchmark = True
+
+    denoise(args.files, args.ckpt_path, batch_size=args.batch_size)
diff --git a/network.py b/network.py
index 68f0013..59696bd 100644
--- a/network.py
+++ b/network.py
@@ -27,7 +27,8 @@ def forward(self, q, k, v, mask=None):
         attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
 
         if mask is not None:
-            attn = attn.masked_fill(mask == 0, -1e9)
+            _MASKING_VALUE = -1e9 if attn.dtype == torch.float32 else -1e4
+            attn = attn.masked_fill(mask == 0, _MASKING_VALUE)
 
         attn = self.dropout(F.softmax(attn, dim=-1))
         output = torch.matmul(attn, v)
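
A note on the ``network.py`` change above: once ``attn`` runs in fp16 (e.g., under ``torch.cuda.amp.autocast`` as in ``denoise_simple.py``), filling with ``-1e9`` fails because the largest finite fp16 magnitude is about 65504. A standalone sketch of the failure mode and the fix (illustrative only, not part of the diff):

```
import torch

print(torch.finfo(torch.float16).max)  # 65504.0, the largest finite fp16 value

attn = torch.zeros(2, 3, dtype=torch.float16)
mask = torch.tensor([[1, 0, 1], [0, 1, 1]])

try:
    attn.masked_fill(mask == 0, -1e9)  # -1e9 does not fit in fp16
except RuntimeError as err:
    print(err)  # value cannot be converted to type at::Half without overflow

# -1e4 is finite in fp16 and still drives masked logits to ~0 after softmax
filled = attn.masked_fill(mask == 0, -1e4)
print(torch.softmax(filled.float(), dim=-1))
```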