diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh
index 8859b827f0..9170f45a01 100755
--- a/.github/scripts/unittest-linux/install.sh
+++ b/.github/scripts/unittest-linux/install.sh
@@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in
;;
esac
PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}"
-pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}"
+pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}"
# 2. Install torchaudio
@@ -85,6 +85,9 @@ export BUILD_CPP_TEST=1
python setup.py install
# 3. Install Test tools
+conda install -y "ffmpeg<5"
+python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)"
+
printf "* Installing test tools\n"
NUMBA_DEV_CHANNEL=""
if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
@@ -94,7 +97,7 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
fi
(
set -x
- conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' 'ffmpeg>=6,<7'
+ conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20'
pip install kaldi-io SoundFile librosa coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm
# TODO: might be better to fix the single call to `pip install` above
diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index e92c556218..f681e3b7ec 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -68,7 +68,7 @@ jobs:
GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version.
PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}"
- pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}"
+ pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}"
echo "::endgroup::"
echo "::group::Install TorchAudio"
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 8522161f40..485690e036 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,7 @@
Jinja2<3.1.0
matplotlib<=3.8
pyparsing<3,>=2.0.2
+torchcodec
# C++ docs
breathe==4.34.0
diff --git a/docs/source/index.rst b/docs/source/index.rst
index bee740a167..cb74f4e957 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -182,7 +182,7 @@ Tutorials
.. customcarditem::
:header: Loading waveform Tensors from files and saving them
- :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info
, torchaudio.load
and torchaudio.save
functions.
+ :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info
, torchaudio.utils.load_torchcodec
and torchaudio.save
functions.
:image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/audio_io_tutorial.png
:link: tutorials/audio_io_tutorial.html
:tags: I/O
@@ -399,7 +399,7 @@ In BibTeX format:
.. code-block:: bibtex
@misc{hwang2023torchaudio,
- title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
+ title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
year={2023},
eprint={2310.17864},
diff --git a/examples/asr/emformer_rnnt/mustc/dataset.py b/examples/asr/emformer_rnnt/mustc/dataset.py
index 7417aec164..7628fa2630 100644
--- a/examples/asr/emformer_rnnt/mustc/dataset.py
+++ b/examples/asr/emformer_rnnt/mustc/dataset.py
@@ -4,6 +4,7 @@
import torch
import torchaudio
import yaml
+from torchaudio.utils import load_torchcodec
FOLDER_IN_ARCHIVE = "en-de"
@@ -31,15 +32,15 @@ def __init__(
self.idx_target_lengths = []
self.wav_list = []
for idx, item in enumerate(file_list):
- offset = int(item["offset"] * SAMPLE_RATE)
- duration = int(item["duration"] * SAMPLE_RATE)
+ offset = item["offset"]
+ duration = item["duration"]
self.idx_target_lengths.append((idx, item["duration"]))
file_path = wav_dir / item["wav"]
self.wav_list.append((file_path, offset, duration))
def _get_mustc_item(self, idx):
file_path, offset, duration = self.wav_list[idx]
- waveform, sr = torchaudio.load(file_path, frame_offset=offset, num_frames=duration)
+ waveform, sr = load_torchcodec(file_path, start_seconds=offset, stop_seconds=offset + duration)
assert sr == SAMPLE_RATE
transcript = self.trans_list[idx].replace("\n", "")
return (waveform, transcript)
diff --git a/examples/avsr/data_prep/data/data_module.py b/examples/avsr/data_prep/data/data_module.py
index 542e26147a..3df611f2f8 100644
--- a/examples/avsr/data_prep/data/data_module.py
+++ b/examples/avsr/data_prep/data/data_module.py
@@ -7,7 +7,7 @@
import torch
import torchaudio
import torchvision
-
+from torchaudio.utils import load_torchcodec
class AVSRDataLoader:
def __init__(self, modality, detector="retinaface", resize=None):
@@ -39,7 +39,7 @@ def load_data(self, data_filename, transform=True):
return video
def load_audio(self, data_filename):
- waveform, sample_rate = torchaudio.load(data_filename, normalize=True)
+ waveform, sample_rate = load_torchcodec(data_filename, normalize=True)
return waveform, sample_rate
def load_video(self, data_filename):
diff --git a/examples/avsr/lrs3.py b/examples/avsr/lrs3.py
index b58d96a061..57a77872f7 100644
--- a/examples/avsr/lrs3.py
+++ b/examples/avsr/lrs3.py
@@ -3,6 +3,7 @@
import torchaudio
import torchvision
from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec
def _load_list(args, *filenames):
@@ -31,7 +32,7 @@ def load_audio(path):
"""
rtype: torch, T x 1
"""
- waveform, sample_rate = torchaudio.load(path, normalize=True)
+ waveform, sample_rate = load_torchcodec(path, normalize=True)
return waveform.transpose(1, 0)
diff --git a/examples/dnn_beamformer/datamodule.py b/examples/dnn_beamformer/datamodule.py
index e6f81cbda2..fe82f96e08 100644
--- a/examples/dnn_beamformer/datamodule.py
+++ b/examples/dnn_beamformer/datamodule.py
@@ -8,6 +8,7 @@
from torch import Tensor
from torch.utils.data import Dataset
from utils import CollateFnL3DAS22
+from torchaudio.utils import load_torchcodec
_PREFIX = "L3DAS22_Task1_"
_SUBSETS = {
@@ -46,10 +47,10 @@ def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, str]:
noisy_path_B = str(noisy_path_A).replace("_A.wav", "_B.wav")
clean_path = noisy_path_A.parent.parent / "labels" / noisy_path_A.name.replace("_A.wav", ".wav")
transcript_path = str(clean_path).replace("wav", "txt")
- waveform_noisy_A, sample_rate1 = torchaudio.load(noisy_path_A)
- waveform_noisy_B, sample_rate2 = torchaudio.load(noisy_path_B)
+ waveform_noisy_A, sample_rate1 = load_torchcodec(noisy_path_A)
+ waveform_noisy_B, sample_rate2 = load_torchcodec(noisy_path_B)
waveform_noisy = torch.cat((waveform_noisy_A, waveform_noisy_B), dim=0)
- waveform_clean, sample_rate3 = torchaudio.load(clean_path)
+ waveform_clean, sample_rate3 = load_torchcodec(clean_path)
assert sample_rate1 == _SAMPLE_RATE and sample_rate2 == _SAMPLE_RATE and sample_rate3 == _SAMPLE_RATE
with open(transcript_path, "r") as f:
transcript = f.readline()
diff --git a/examples/hubert/dataset/hubert_dataset.py b/examples/hubert/dataset/hubert_dataset.py
index 3670628fa1..967967f549 100644
--- a/examples/hubert/dataset/hubert_dataset.py
+++ b/examples/hubert/dataset/hubert_dataset.py
@@ -12,6 +12,9 @@
from torch import Tensor
from torch.utils.data import BatchSampler, Dataset, DistributedSampler
+from torchaudio.utils import load_torchcodec
+
+
sys.path.append("..")
from utils import _get_label2id
@@ -299,7 +302,7 @@ def _load_audio(self, index: int) -> Tensor:
(Tensor): The corresponding waveform Tensor.
"""
wav_path = self.f_list[index]
- waveform, sample_rate = torchaudio.load(wav_path)
+ waveform, sample_rate = load_torchcodec(wav_path)
assert waveform.shape[1] == self.len_list[index]
return waveform
diff --git a/examples/hubert/utils/feature_utils.py b/examples/hubert/utils/feature_utils.py
index 534d4f10fe..918d7cfcd5 100644
--- a/examples/hubert/utils/feature_utils.py
+++ b/examples/hubert/utils/feature_utils.py
@@ -13,6 +13,7 @@
from torch.nn import Module
from .common_utils import _get_feat_lens_paths
+from torchaudio.utils import load_torchcodec
_LG = logging.getLogger(__name__)
_DEFAULT_DEVICE = torch.device("cpu")
@@ -53,7 +54,7 @@ def extract_feature_mfcc(
Returns:
Tensor: The desired feature tensor of the given audio file.
"""
- waveform, sr = torchaudio.load(path)
+ waveform, sr = load_torchcodec(path)
assert sr == sample_rate
feature_extractor = torchaudio.transforms.MFCC(
sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 160, "center": False}
@@ -88,7 +89,7 @@ def extract_feature_hubert(
Returns:
Tensor: The desired feature tensor of the given audio file.
"""
- waveform, sr = torchaudio.load(path)
+ waveform, sr = load_torchcodec(path)
assert sr == sample_rate
waveform = waveform.to(device)
with torch.inference_mode():
diff --git a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py
index 79f56819fc..b050de04d4 100755
--- a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py
+++ b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py
@@ -7,7 +7,7 @@
import torch
import torchaudio
-
+from torchaudio.utils import load_torchcodec
class Pipeline(torch.nn.Module):
"""Example audio process pipeline.
@@ -17,7 +17,7 @@ class Pipeline(torch.nn.Module):
def __init__(self, rir_path: str):
super().__init__()
- rir, sample_rate = torchaudio.load(rir_path)
+ rir, sample_rate = load_torchcodec(rir_path)
self.register_buffer("rir", rir)
self.rir_sample_rate: int = sample_rate
@@ -25,7 +25,7 @@ def forward(self, input_path: str, output_path: str):
torchaudio.sox_effects.init_sox_effects()
# 1. load audio
- waveform, sample_rate = torchaudio.load(input_path)
+ waveform, sample_rate = load_torchcodec(input_path)
# 2. Add background noise
alpha = 0.01
diff --git a/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py b/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py
index dcbe3c011a..9a175601f6 100644
--- a/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py
+++ b/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py
@@ -14,6 +14,7 @@
from greedy_decoder import Decoder
from torch.utils.mobile_optimizer import optimize_for_mobile
from torchaudio.models.wav2vec2.utils.import_fairseq import import_fairseq_model
+from torchaudio.utils import load_torchcodec
TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 10):
@@ -58,7 +59,7 @@ def _parse_args():
class Loader(torch.nn.Module):
def forward(self, audio_path: str) -> torch.Tensor:
- waveform, sample_rate = torchaudio.load(audio_path)
+ waveform, sample_rate = load_torchcodec(audio_path)
if sample_rate != 16000:
waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
return waveform
diff --git a/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py b/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py
index 344d3d09a2..6e0b05b1df 100644
--- a/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py
+++ b/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py
@@ -8,6 +8,7 @@
import torchaudio
from greedy_decoder import Decoder
from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model
+from torchaudio.utils import load_torchcodec
TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 10):
@@ -49,7 +50,7 @@ def _parse_args():
class Loader(torch.nn.Module):
def forward(self, audio_path: str) -> torch.Tensor:
- waveform, sample_rate = torchaudio.load(audio_path)
+ waveform, sample_rate = load_torchcodec(audio_path)
if sample_rate != 16000:
waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
return waveform
diff --git a/examples/self_supervised_learning/data_modules/_utils.py b/examples/self_supervised_learning/data_modules/_utils.py
index 0333ca605d..b63eb77a43 100644
--- a/examples/self_supervised_learning/data_modules/_utils.py
+++ b/examples/self_supervised_learning/data_modules/_utils.py
@@ -8,6 +8,7 @@
import torchaudio
from torch import Tensor
from torch.utils.data import BatchSampler, Dataset, DistributedSampler
+from torchaudio.utils import load_torchcodec
from ..lightning_modules import Batch
@@ -295,7 +296,7 @@ def _load_audio(self, index: int) -> Tensor:
(Tensor): The corresponding waveform Tensor.
"""
wav_path = self.f_list[index]
- waveform, sample_rate = torchaudio.load(wav_path)
+ waveform, sample_rate = load_torchcodec(wav_path)
assert waveform.shape[1] == self.len_list[index]
return waveform
diff --git a/examples/source_separation/utils/dataset/wsj0mix.py b/examples/source_separation/utils/dataset/wsj0mix.py
index 3d3c5f826d..8846ce3f42 100644
--- a/examples/source_separation/utils/dataset/wsj0mix.py
+++ b/examples/source_separation/utils/dataset/wsj0mix.py
@@ -4,6 +4,7 @@
import torch
import torchaudio
from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec
SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]]
@@ -37,7 +38,7 @@ def __init__(
self.files.sort()
def _load_audio(self, path) -> torch.Tensor:
- waveform, sample_rate = torchaudio.load(path)
+ waveform, sample_rate = load_torchcodec(path)
if sample_rate != self.sample_rate:
raise ValueError(
f"The dataset contains audio file of sample rate {sample_rate}, "
diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
index 624cd8066a..775492a53c 100644
--- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
+++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
@@ -65,6 +65,7 @@
import matplotlib.pyplot as plt
from torchaudio.models.decoder import ctc_decoder
from torchaudio.utils import download_asset
+from torchaudio.utils import load_torchcodec
######################################################################
#
@@ -98,7 +99,7 @@
# i really was very much afraid of showing him how much shocked i was at some parts of what he said
#
-waveform, sample_rate = torchaudio.load(speech_file)
+waveform, sample_rate = load_torchcodec(speech_file)
if sample_rate != bundle.sample_rate:
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
diff --git a/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py
index 8329d8a40e..ae17513c35 100755
--- a/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py
+++ b/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py
@@ -54,6 +54,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -96,7 +97,7 @@ def download_asset_external(url, key):
#
speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
-waveform, sample_rate = torchaudio.load(speech_file)
+waveform, sample_rate = load_torchcodec(speech_file)
assert sample_rate == 16000
IPython.display.Audio(speech_file)
diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py
index 734cb57bb4..7b3bc6042d 100644
--- a/examples/tutorials/audio_data_augmentation_tutorial.py
+++ b/examples/tutorials/audio_data_augmentation_tutorial.py
@@ -15,6 +15,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
import torchaudio.functional as F
print(torch.__version__)
@@ -52,7 +53,7 @@
#
# Load the data
-waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
+waveform1, sample_rate = load_torchcodec(SAMPLE_WAV, channels_first=False)
# Define effects
effect = ",".join(
@@ -159,7 +160,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
# and clap your hands.
#
-rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR)
+rir_raw, sample_rate = load_torchcodec(SAMPLE_RIR)
plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)")
plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
Audio(rir_raw, rate=sample_rate)
@@ -179,7 +180,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
# we convolve the speech signal with the RIR.
#
-speech, _ = torchaudio.load(SAMPLE_SPEECH)
+speech, _ = load_torchcodec(SAMPLE_SPEECH)
augmented = F.fftconvolve(speech, rir)
######################################################################
@@ -219,8 +220,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
# To add noise to audio data per SNRs, we
# use :py:func:`torchaudio.functional.add_noise`.
-speech, _ = torchaudio.load(SAMPLE_SPEECH)
-noise, _ = torchaudio.load(SAMPLE_NOISE)
+speech, _ = load_torchcodec(SAMPLE_SPEECH)
+noise, _ = load_torchcodec(SAMPLE_NOISE)
noise = noise[:, : speech.shape[1]]
snr_dbs = torch.tensor([20, 10, 3])
@@ -275,7 +276,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
# a Tensor object.
#
-waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)
+waveform, sample_rate = load_torchcodec(SAMPLE_SPEECH, channels_first=False)
def apply_codec(waveform, sample_rate, format, encoder=None):
@@ -332,7 +333,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None):
#
sample_rate = 16000
-original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
+original_speech, sample_rate = load_torchcodec(SAMPLE_SPEECH)
plot_specgram(original_speech, sample_rate, title="Original")
@@ -345,7 +346,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None):
# Because the noise is recorded in the actual environment, we consider that
# the noise contains the acoustic feature of the environment. Therefore, we add
# the noise after RIR application.
-noise, _ = torchaudio.load(SAMPLE_NOISE)
+noise, _ = load_torchcodec(SAMPLE_NOISE)
noise = noise[:, : rir_applied.shape[1]]
snr_db = torch.tensor([8])
diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py
index eb43c6dca8..7b81333e1c 100644
--- a/examples/tutorials/audio_feature_extractions_tutorial.py
+++ b/examples/tutorials/audio_feature_extractions_tutorial.py
@@ -21,6 +21,7 @@
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -103,7 +104,7 @@ def plot_fbank(fbank, title=None):
#
# Load audio
-SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH)
+SPEECH_WAVEFORM, SAMPLE_RATE = load_torchcodec(SAMPLE_SPEECH)
# Define transform
spectrogram = T.Spectrogram(n_fft=512)
diff --git a/examples/tutorials/audio_io_tutorial.py b/examples/tutorials/audio_io_tutorial.py
index ddcd931f62..daf6cd20ef 100644
--- a/examples/tutorials/audio_io_tutorial.py
+++ b/examples/tutorials/audio_io_tutorial.py
@@ -22,6 +22,8 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
+from io import BytesIO
print(torch.__version__)
print(torchaudio.__version__)
@@ -151,7 +153,7 @@ def read(self, n):
# Loading audio data
# ------------------
#
-# To load audio data, you can use :py:func:`torchaudio.load`.
+# To load audio data, you can use :py:func:`load_torchcodec`.
#
# This function accepts a path-like object or file-like object as input.
#
@@ -165,7 +167,7 @@ def read(self, n):
# documentation `__.
#
-waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
+waveform, sample_rate = load_torchcodec(SAMPLE_WAV)
######################################################################
@@ -219,10 +221,10 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
Audio(waveform.numpy()[0], rate=sample_rate)
######################################################################
-# Loading from file-like object
+# Loading from URLs and file-like object
# -----------------------------
#
-# The I/O functions support file-like objects.
+# The I/O functions support URLs and file-like objects.
# This allows for fetching and decoding audio data from locations
# within and beyond the local file system.
# The following examples illustrate this.
@@ -231,10 +233,9 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
######################################################################
#
-# Load audio data as HTTP request
+# Load audio data from an HTTP request
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
-with requests.get(url, stream=True) as response:
- waveform, sample_rate = torchaudio.load(_hide_seek(response.raw))
+waveform, sample_rate = load_torchcodec(url)
plot_specgram(waveform, sample_rate, title="HTTP datasource")
######################################################################
@@ -245,7 +246,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
with tarfile.open(tar_path, mode="r") as tarfile_:
fileobj = tarfile_.extractfile(tar_item)
- waveform, sample_rate = torchaudio.load(fileobj)
+ waveform, sample_rate = load_torchcodec(fileobj)
plot_specgram(waveform, sample_rate, title="TAR file")
######################################################################
@@ -256,7 +257,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
response = client.get_object(Bucket=bucket, Key=key)
-waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"]))
+waveform, sample_rate = load_torchcodec(BytesIO(response['Body'].read()))
plot_specgram(waveform, sample_rate, title="From S3")
@@ -289,17 +290,13 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
print("Fetching all the data...")
-with requests.get(url, stream=True) as response:
- waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw))
- waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
- print(f" - Fetched {response.raw.tell()} bytes")
+waveform1, sample_rate1 = load_torchcodec(url)
+waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
print("Fetching until the requested frames are available...")
-with requests.get(url, stream=True) as response:
- waveform2, sample_rate2 = torchaudio.load(
- _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames
- )
- print(f" - Fetched {response.raw.tell()} bytes")
+waveform2, sample_rate2 = load_torchcodec(
+ url, start_seconds=1, stop_seconds=2
+)
print("Checking the resulting waveform ... ", end="")
assert (waveform1 == waveform2).all()
@@ -331,7 +328,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"):
# resulting file size but also precision.
#
-waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
+waveform, sample_rate = load_torchcodec(SAMPLE_WAV)
######################################################################
@@ -383,7 +380,7 @@ def inspect_file(path):
######################################################################
#
-waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000)
+waveform, sample_rate = load_torchcodec(SAMPLE_WAV_8000)
with tempfile.TemporaryDirectory() as tempdir:
for format in formats:
path = f"{tempdir}/save_example.{format}"
@@ -400,7 +397,7 @@ def inspect_file(path):
#
-waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
+waveform, sample_rate = load_torchcodec(SAMPLE_WAV)
# Saving to bytes buffer
buffer_ = io.BytesIO()
diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
index 789fa3cf85..610ccc9abc 100644
--- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py
+++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
@@ -39,6 +39,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -63,7 +64,7 @@
#
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
-waveform, _ = torchaudio.load(SPEECH_FILE)
+waveform, _ = load_torchcodec(SPEECH_FILE)
TRANSCRIPT = "i had that curiosity beside me at this moment".split()
diff --git a/examples/tutorials/effector_tutorial.py b/examples/tutorials/effector_tutorial.py
index 8eadcf6ef4..dffa35e893 100644
--- a/examples/tutorials/effector_tutorial.py
+++ b/examples/tutorials/effector_tutorial.py
@@ -43,6 +43,7 @@
#
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -92,7 +93,7 @@
#
src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
-waveform, sr = torchaudio.load(src, channels_first=False)
+waveform, sr = load_torchcodec(src, channels_first=False)
######################################################################
diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
index 00dfe68b9d..aa21a6076a 100644
--- a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
+++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
@@ -26,6 +26,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -244,9 +245,8 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam
text_normalized = "aber seit ich bei ihnen das brot hole"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac"
-waveform, sample_rate = torchaudio.load(
- url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate)
-)
+waveform, sample_rate = load_torchcodec(
+ url, start_seconds=0.5, stop_seconds=3)
######################################################################
#
@@ -326,7 +326,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam
#
url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav"
-waveform, sample_rate = torchaudio.load(url)
+waveform, sample_rate = load_torchcodec(url)
waveform = waveform[0:1]
######################################################################
@@ -400,7 +400,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam
text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac"
-waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate))
+waveform, sample_rate = load_torchcodec(url, stop_seconds=4.5)
######################################################################
#
@@ -467,9 +467,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam
text_normalized = "na imensa extensao onde se esconde o inconsciente imortal"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac"
-waveform, sample_rate = torchaudio.load(
- url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate)
-)
+waveform, sample_rate = load_torchcodec(url, start_seconds=1, stop_seconds=4.6)
######################################################################
#
@@ -542,7 +540,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam
text_normalized = "elle giacean per terra tutte quante"
url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac"
-waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate))
+waveform, sample_rate = load_torchcodec(url, stop_seconds=4)
######################################################################
#
diff --git a/examples/tutorials/forced_alignment_tutorial.py b/examples/tutorials/forced_alignment_tutorial.py
index 624037da9d..a10fea4dcc 100644
--- a/examples/tutorials/forced_alignment_tutorial.py
+++ b/examples/tutorials/forced_alignment_tutorial.py
@@ -42,6 +42,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -106,7 +107,7 @@
model = bundle.get_model().to(device)
labels = bundle.get_labels()
with torch.inference_mode():
- waveform, _ = torchaudio.load(SPEECH_FILE)
+ waveform, _ = load_torchcodec(SPEECH_FILE)
emissions, _ = model(waveform.to(device))
emissions = torch.log_softmax(emissions, dim=-1)
diff --git a/examples/tutorials/hybrid_demucs_tutorial.py b/examples/tutorials/hybrid_demucs_tutorial.py
index 081534bfe4..6bb90d9987 100644
--- a/examples/tutorials/hybrid_demucs_tutorial.py
+++ b/examples/tutorials/hybrid_demucs_tutorial.py
@@ -41,6 +41,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -187,7 +188,7 @@ def plot_spectrogram(stft, title="Spectrogram"):
# We download the audio file from our storage. Feel free to download another file and use audio from a specific path
SAMPLE_SONG = download_asset("tutorial-assets/hdemucs_mix.wav")
-waveform, sample_rate = torchaudio.load(SAMPLE_SONG) # replace SAMPLE_SONG with desired path for different song
+waveform, sample_rate = load_torchcodec(SAMPLE_SONG) # replace SAMPLE_SONG with desired path for different song
waveform = waveform.to(device)
mixture = waveform
@@ -267,16 +268,16 @@ def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor
other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav")
drums_spec = audios["drums"][:, frame_start:frame_end].cpu()
-drums, sample_rate = torchaudio.load(drums_original)
+drums, sample_rate = load_torchcodec(drums_original)
bass_spec = audios["bass"][:, frame_start:frame_end].cpu()
-bass, sample_rate = torchaudio.load(bass_original)
+bass, sample_rate = load_torchcodec(bass_original)
vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu()
-vocals, sample_rate = torchaudio.load(vocals_original)
+vocals, sample_rate = load_torchcodec(vocals_original)
other_spec = audios["other"][:, frame_start:frame_end].cpu()
-other, sample_rate = torchaudio.load(other_original)
+other, sample_rate = load_torchcodec(other_original)
mix_spec = mixture[:, frame_start:frame_end].cpu()
diff --git a/examples/tutorials/mvdr_tutorial.py b/examples/tutorials/mvdr_tutorial.py
index 442f6234a6..8c9e59dcf6 100644
--- a/examples/tutorials/mvdr_tutorial.py
+++ b/examples/tutorials/mvdr_tutorial.py
@@ -31,6 +31,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
import torchaudio.functional as F
print(torch.__version__)
@@ -170,8 +171,8 @@ def evaluate(estimate, reference):
# ~~~~~~~~~~~~~~~~~~~~
#
-waveform_clean, sr = torchaudio.load(SAMPLE_CLEAN)
-waveform_noise, sr2 = torchaudio.load(SAMPLE_NOISE)
+waveform_clean, sr = load_torchcodec(SAMPLE_CLEAN)
+waveform_noise, sr2 = load_torchcodec(SAMPLE_NOISE)
assert sr == sr2 == SAMPLE_RATE
# The mixture waveform is a combination of clean and noise waveforms with a desired SNR.
target_snr = 3
diff --git a/examples/tutorials/speech_recognition_pipeline_tutorial.py b/examples/tutorials/speech_recognition_pipeline_tutorial.py
index 2d815a2e8e..83c7ec0f3b 100644
--- a/examples/tutorials/speech_recognition_pipeline_tutorial.py
+++ b/examples/tutorials/speech_recognition_pipeline_tutorial.py
@@ -37,6 +37,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -114,7 +115,7 @@
######################################################################
-# To load data, we use :py:func:`torchaudio.load`.
+# To load data, we use :py:func:`load_torchcodec`.
#
# If the sampling rate is different from what the pipeline expects, then
# we can use :py:func:`torchaudio.functional.resample` for resampling.
@@ -126,7 +127,7 @@
# using :py:class:`torchaudio.transforms.Resample` might improve the performace.
#
-waveform, sample_rate = torchaudio.load(SPEECH_FILE)
+waveform, sample_rate = load_torchcodec(SPEECH_FILE)
waveform = waveform.to(device)
if sample_rate != bundle.sample_rate:
diff --git a/examples/tutorials/squim_tutorial.py b/examples/tutorials/squim_tutorial.py
index 9b9b55ac2e..792f2356d9 100644
--- a/examples/tutorials/squim_tutorial.py
+++ b/examples/tutorials/squim_tutorial.py
@@ -62,6 +62,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -158,8 +159,8 @@ def plot(waveform, title, sample_rate=16000):
#
#
-WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(SAMPLE_SPEECH)
-WAVEFORM_NOISE, SAMPLE_RATE_NOISE = torchaudio.load(SAMPLE_NOISE)
+WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = load_torchcodec(SAMPLE_SPEECH)
+WAVEFORM_NOISE, SAMPLE_RATE_NOISE = load_torchcodec(SAMPLE_NOISE)
WAVEFORM_NOISE = WAVEFORM_NOISE[0:1, :]
@@ -328,7 +329,7 @@ def plot(waveform, title, sample_rate=16000):
NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
-WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH)
+WAVEFORM_NMR, SAMPLE_RATE_NMR = load_torchcodec(NMR_SPEECH)
if SAMPLE_RATE_NMR != 16000:
WAVEFORM_NMR = F.resample(WAVEFORM_NMR, SAMPLE_RATE_NMR, 16000)
diff --git a/examples/tutorials/streamwriter_advanced.py b/examples/tutorials/streamwriter_advanced.py
index 37347d1387..29f0efe111 100644
--- a/examples/tutorials/streamwriter_advanced.py
+++ b/examples/tutorials/streamwriter_advanced.py
@@ -64,6 +64,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -128,7 +129,7 @@
#
# Prepare sample audio
-waveform, sample_rate = torchaudio.load(AUDIO_PATH, channels_first=False, normalize=False)
+waveform, sample_rate = load_torchcodec(AUDIO_PATH, channels_first=False, normalize=False)
num_frames, num_channels = waveform.shape
######################################################################
diff --git a/examples/tutorials/streamwriter_basic_tutorial.py b/examples/tutorials/streamwriter_basic_tutorial.py
index 35af1a177d..714c4bbadc 100644
--- a/examples/tutorials/streamwriter_basic_tutorial.py
+++ b/examples/tutorials/streamwriter_basic_tutorial.py
@@ -52,6 +52,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
print(torch.__version__)
print(torchaudio.__version__)
@@ -74,7 +75,7 @@
from torchaudio.utils import download_asset
SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
-WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False)
+WAVEFORM, SAMPLE_RATE = load_torchcodec(SAMPLE_PATH, channels_first=False)
NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape
_BASE_DIR = tempfile.TemporaryDirectory()
diff --git a/src/torchaudio/datasets/cmuarctic.py b/src/torchaudio/datasets/cmuarctic.py
index 96f498f00f..10b2151e43 100644
--- a/src/torchaudio/datasets/cmuarctic.py
+++ b/src/torchaudio/datasets/cmuarctic.py
@@ -4,6 +4,7 @@
from typing import Tuple, Union
import torchaudio
+from torchaudio.utils import load_torchcodec
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
@@ -43,7 +44,7 @@ def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str)
file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio)
# Load audio
- waveform, sample_rate = torchaudio.load(file_audio)
+ waveform, sample_rate = load_torchcodec(file_audio)
return (waveform, sample_rate, transcript, utterance_id.split("_")[1])
diff --git a/src/torchaudio/datasets/commonvoice.py b/src/torchaudio/datasets/commonvoice.py
index db0e035c61..d926e22d03 100644
--- a/src/torchaudio/datasets/commonvoice.py
+++ b/src/torchaudio/datasets/commonvoice.py
@@ -6,6 +6,7 @@
import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec
def load_commonvoice_item(
@@ -20,7 +21,7 @@ def load_commonvoice_item(
filename = os.path.join(path, folder_audio, fileid)
if not filename.endswith(ext_audio):
filename += ext_audio
- waveform, sample_rate = torchaudio.load(filename)
+ waveform, sample_rate = load_torchcodec(filename)
dic = dict(zip(header, line))
diff --git a/src/torchaudio/datasets/dr_vctk.py b/src/torchaudio/datasets/dr_vctk.py
index a634b96894..dde5326a8e 100644
--- a/src/torchaudio/datasets/dr_vctk.py
+++ b/src/torchaudio/datasets/dr_vctk.py
@@ -6,6 +6,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip
+from torchaudio.utils import load_torchcodec
_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
@@ -75,8 +76,8 @@ def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, s
source, channel_id = self._config[filename]
file_clean_audio = self._clean_audio_dir / filename
file_noisy_audio = self._noisy_audio_dir / filename
- waveform_clean, sample_rate_clean = torchaudio.load(file_clean_audio)
- waveform_noisy, sample_rate_noisy = torchaudio.load(file_noisy_audio)
+ waveform_clean, sample_rate_clean = load_torchcodec(file_clean_audio)
+ waveform_noisy, sample_rate_noisy = load_torchcodec(file_noisy_audio)
return (
waveform_clean,
sample_rate_clean,
diff --git a/src/torchaudio/datasets/gtzan.py b/src/torchaudio/datasets/gtzan.py
index 347e7e7183..2fc5e4d357 100644
--- a/src/torchaudio/datasets/gtzan.py
+++ b/src/torchaudio/datasets/gtzan.py
@@ -7,6 +7,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar
+from torchaudio.utils import load_torchcodec
# The following lists prefixed with `filtered_` provide a filtered split
# that:
@@ -990,7 +991,7 @@ def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, str
# Read wav
file_audio = os.path.join(path, label, fileid + ext_audio)
- waveform, sample_rate = torchaudio.load(file_audio)
+ waveform, sample_rate = load_torchcodec(file_audio)
return waveform, sample_rate, label
diff --git a/src/torchaudio/datasets/librilight_limited.py b/src/torchaudio/datasets/librilight_limited.py
index f0cb3100f7..01dcb99f1f 100644
--- a/src/torchaudio/datasets/librilight_limited.py
+++ b/src/torchaudio/datasets/librilight_limited.py
@@ -8,6 +8,7 @@
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.librispeech import _get_librispeech_metadata
from torchaudio.datasets.utils import _extract_tar
+from torchaudio.utils import load_torchcodec
_ARCHIVE_NAME = "librispeech_finetuning"
@@ -104,7 +105,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
"""
file_path, fileid = self._fileids_paths[n]
metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt)
- waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0]))
+ waveform, _ = load_torchcodec(os.path.join(self._path, metadata[0]))
return (waveform,) + metadata[1:]
def __len__(self) -> int:
diff --git a/src/torchaudio/datasets/libritts.py b/src/torchaudio/datasets/libritts.py
index 829ce95729..95a878ce02 100644
--- a/src/torchaudio/datasets/libritts.py
+++ b/src/torchaudio/datasets/libritts.py
@@ -7,6 +7,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar
+from torchaudio.utils import load_torchcodec
URL = "train-clean-100"
FOLDER_IN_ARCHIVE = "LibriTTS"
@@ -41,7 +42,7 @@ def load_libritts_item(
file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)
# Load audio
- waveform, sample_rate = torchaudio.load(file_audio)
+ waveform, sample_rate = load_torchcodec(file_audio)
# Load original text
with open(original_text) as ft:
diff --git a/src/torchaudio/datasets/ljspeech.py b/src/torchaudio/datasets/ljspeech.py
index 9cdaeeb0f3..d9a5554cfc 100644
--- a/src/torchaudio/datasets/ljspeech.py
+++ b/src/torchaudio/datasets/ljspeech.py
@@ -8,7 +8,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar
-
+from torchaudio.utils import load_torchcodec
_RELEASE_CONFIGS = {
"release1": {
@@ -94,7 +94,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]:
fileid_audio = self._path / (fileid + ".wav")
# Load audio
- waveform, sample_rate = torchaudio.load(fileid_audio)
+ waveform, sample_rate = load_torchcodec(fileid_audio)
return (
waveform,
diff --git a/src/torchaudio/datasets/musdb_hq.py b/src/torchaudio/datasets/musdb_hq.py
index dd4bc9f340..a74de61370 100644
--- a/src/torchaudio/datasets/musdb_hq.py
+++ b/src/torchaudio/datasets/musdb_hq.py
@@ -7,6 +7,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip
+from torchaudio.utils import load_torchcodec
_URL = "https://zenodo.org/record/3338373/files/musdb18hq.zip"
_CHECKSUM = "baac80d0483c61d74b2e5f3be75fa557eec52898339e6aa45c1fa48833c5d21d"
@@ -87,7 +88,7 @@ def _load_sample(self, n: int) -> Tuple[torch.Tensor, int, int, str]:
num_frames = None
for source in self.sources:
track = self._get_track(name, source)
- wav, sr = torchaudio.load(str(track))
+ wav, sr = load_torchcodec(str(track))
if sr != _SAMPLE_RATE:
raise ValueError(f"expected sample rate {_SAMPLE_RATE}, but got {sr}")
if num_frames is None:
diff --git a/src/torchaudio/datasets/tedlium.py b/src/torchaudio/datasets/tedlium.py
index 7e7d22195a..3c7182100b 100644
--- a/src/torchaudio/datasets/tedlium.py
+++ b/src/torchaudio/datasets/tedlium.py
@@ -7,6 +7,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar
+from torchaudio.utils import load_torchcodec
_RELEASE_CONFIGS = {
@@ -163,12 +164,7 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate
Returns:
[Tensor, int]: Audio tensor representation and sample rate
"""
- start_time = int(float(start_time) * sample_rate)
- end_time = int(float(end_time) * sample_rate)
-
- kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time}
-
- return torchaudio.load(path, **kwargs)
+ return load_torchcodec(path, start_seconds=float(start_time), stop_seconds=float(end_time))
def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
"""Load the n-th sample from the dataset.
diff --git a/src/torchaudio/datasets/utils.py b/src/torchaudio/datasets/utils.py
index b4599f83aa..2952510eab 100644
--- a/src/torchaudio/datasets/utils.py
+++ b/src/torchaudio/datasets/utils.py
@@ -3,6 +3,7 @@
import tarfile
import zipfile
from typing import Any, List, Optional
+from torchaudio.utils import load_torchcodec
import torchaudio
@@ -48,7 +49,7 @@ def _load_waveform(
exp_sample_rate: int,
):
path = os.path.join(root, filename)
- waveform, sample_rate = torchaudio.load(path)
+ waveform, sample_rate = load_torchcodec(path)
if exp_sample_rate != sample_rate:
raise ValueError(f"sample rate should be {exp_sample_rate}, but got {sample_rate}")
return waveform
diff --git a/src/torchaudio/datasets/vctk.py b/src/torchaudio/datasets/vctk.py
index 3195b9b427..4879c5274e 100644
--- a/src/torchaudio/datasets/vctk.py
+++ b/src/torchaudio/datasets/vctk.py
@@ -6,6 +6,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip
+from torchaudio.utils import load_torchcodec
URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
_CHECKSUMS = {
@@ -98,7 +99,7 @@ def _load_text(self, file_path) -> str:
return file_path.readlines()[0]
def _load_audio(self, file_path) -> Tuple[Tensor, int]:
- return torchaudio.load(file_path)
+ return load_torchcodec(file_path)
def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType:
transcript_path = os.path.join(self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt")
diff --git a/src/torchaudio/datasets/yesno.py b/src/torchaudio/datasets/yesno.py
index baad08f159..ba42775be8 100644
--- a/src/torchaudio/datasets/yesno.py
+++ b/src/torchaudio/datasets/yesno.py
@@ -7,7 +7,7 @@
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_tar
-
+from torchaudio.utils import load_torchcodec
_RELEASE_CONFIGS = {
"release1": {
@@ -62,7 +62,7 @@ def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, downloa
def _load_item(self, fileid: str, path: str):
labels = [int(c) for c in fileid.split("_")]
file_audio = os.path.join(path, fileid + ".wav")
- waveform, sample_rate = torchaudio.load(file_audio)
+ waveform, sample_rate = load_torchcodec(file_audio)
return waveform, sample_rate, labels
def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]:
diff --git a/src/torchaudio/models/wav2vec2/utils/import_fairseq.py b/src/torchaudio/models/wav2vec2/utils/import_fairseq.py
index 39791e9b7d..d255730e53 100644
--- a/src/torchaudio/models/wav2vec2/utils/import_fairseq.py
+++ b/src/torchaudio/models/wav2vec2/utils/import_fairseq.py
@@ -140,7 +140,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
Example - Loading pretrain-only model
>>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
- >>>
+ >>> from torchaudio.utils import load_torchcodec
>>> # Load model using fairseq
>>> model_file = 'wav2vec_small.pt'
>>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
@@ -148,7 +148,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
>>> imported = import_fairseq_model(original)
>>>
>>> # Perform feature extraction
- >>> waveform, _ = torchaudio.load('audio.wav')
+ >>> waveform, _ = load_torchcodec('audio.wav')
>>> features, _ = imported.extract_features(waveform)
>>>
>>> # Compare result with the original model from fairseq
@@ -157,7 +157,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
Example - Fine-tuned model
>>> from torchaudio.models.wav2vec2.utils import import_fairseq_model
- >>>
+ >>> from torchaudio.utils import load_torchcodec
>>> # Load model using fairseq
>>> model_file = 'wav2vec_small_960h.pt'
>>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file])
@@ -165,7 +165,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model:
>>> imported = import_fairseq_model(original.w2v_encoder)
>>>
>>> # Perform encoding
- >>> waveform, _ = torchaudio.load('audio.wav')
+ >>> waveform, _ = load_torchcodec('audio.wav')
>>> emission, _ = imported(waveform)
>>>
>>> # Compare result with the original model from fairseq
diff --git a/src/torchaudio/models/wav2vec2/utils/import_huggingface.py b/src/torchaudio/models/wav2vec2/utils/import_huggingface.py
index 519d8c919f..7187536d25 100644
--- a/src/torchaudio/models/wav2vec2/utils/import_huggingface.py
+++ b/src/torchaudio/models/wav2vec2/utils/import_huggingface.py
@@ -117,8 +117,8 @@ def import_huggingface_model(original: Module) -> Wav2Vec2Model:
>>>
>>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = import_huggingface_model(original)
- >>>
- >>> waveforms, _ = torchaudio.load("audio.wav")
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveforms, _ = load_torchcodec("audio.wav")
>>> logits, _ = model(waveforms)
"""
_LG.info("Importing model.")
diff --git a/src/torchaudio/models/wavernn.py b/src/torchaudio/models/wavernn.py
index 8ae5a3e916..c2367ed96b 100644
--- a/src/torchaudio/models/wavernn.py
+++ b/src/torchaudio/models/wavernn.py
@@ -222,7 +222,8 @@ class WaveRNN(nn.Module):
Example
>>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
- >>> waveform, sample_rate = torchaudio.load(file)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec(file)
>>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
>>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
>>> output = wavernn(waveform, specgram)
diff --git a/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py b/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
index 0ae812f920..b23db4c9fc 100644
--- a/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
+++ b/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py
@@ -22,12 +22,12 @@ class VGGishBundle:
Example:
>>> import torchaudio
>>> from torchaudio.prototype.pipelines import VGGISH
- >>>
+ >>> from torchaudio.utils import load_torchcodec
>>> input_sr = VGGISH.sample_rate
>>> input_proc = VGGISH.get_input_processor()
>>> model = VGGISH.get_model()
>>>
- >>> waveform, sr = torchaudio.load(
+ >>> waveform, sr = load_torchcodec(
>>> "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3",
>>> )
>>> waveform = waveform.squeeze(0)
diff --git a/src/torchaudio/prototype/transforms/_transforms.py b/src/torchaudio/prototype/transforms/_transforms.py
index 3390b3a583..88930c38b3 100644
--- a/src/torchaudio/prototype/transforms/_transforms.py
+++ b/src/torchaudio/prototype/transforms/_transforms.py
@@ -24,7 +24,8 @@ class BarkScale(torch.nn.Module):
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
>>> spectrogram = spectrogram_transform(waveform)
>>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
@@ -95,7 +96,8 @@ class InverseBarkScale(torch.nn.Module):
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> mel_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
>>> mel_spectrogram = bark_spectrogram_transform(waveform)
>>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
@@ -230,7 +232,8 @@ class BarkSpectrogram(torch.nn.Module):
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.BarkSpectrogram(sample_rate)
>>> bark_specgram = transform(waveform) # (channel, n_barks, time)
@@ -320,7 +323,8 @@ class ChromaScale(torch.nn.Module):
base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
>>> spectrogram = spectrogram_transform(waveform)
>>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
@@ -397,7 +401,8 @@ class ChromaSpectrogram(torch.nn.Module):
base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
>>> chromagram = transform(waveform) # (channel, n_chroma, time)
"""
diff --git a/src/torchaudio/sox_effects/sox_effects.py b/src/torchaudio/sox_effects/sox_effects.py
index 256c461edc..b50925c2c2 100644
--- a/src/torchaudio/sox_effects/sox_effects.py
+++ b/src/torchaudio/sox_effects/sox_effects.py
@@ -151,7 +151,8 @@ def apply_effects_tensor(
>>> transform = torch.jit.load(path)
>>>
>>>> # Run transform
- >>> waveform, input_sample_rate = torchaudio.load("input.wav")
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, input_sample_rate = load_torchcodec("input.wav")
>>> waveform, sample_rate = transform(waveform, input_sample_rate)
>>> assert sample_rate == 8000
"""
diff --git a/src/torchaudio/transforms/_transforms.py b/src/torchaudio/transforms/_transforms.py
index 5bf914bc12..deeb7e0928 100644
--- a/src/torchaudio/transforms/_transforms.py
+++ b/src/torchaudio/transforms/_transforms.py
@@ -54,7 +54,8 @@ class Spectrogram(torch.nn.Module):
Deprecated and not used.
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = torchaudio.transforms.Spectrogram(n_fft=800)
>>> spectrogram = transform(waveform)
@@ -315,7 +316,8 @@ class AmplitudeToDB(torch.nn.Module):
number is 80. (Default: ``None``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.AmplitudeToDB(stype="amplitude", top_db=80)
>>> waveform_db = transform(waveform)
"""
@@ -364,7 +366,8 @@ class MelScale(torch.nn.Module):
mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
>>> spectrogram = spectrogram_transform(waveform)
>>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
@@ -438,7 +441,8 @@ class InverseMelScale(torch.nn.Module):
(Default: ``"gels``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> mel_spectrogram_transform = transforms.MelSpectrogram(sample_rate, n_fft=1024)
>>> mel_spectrogram = mel_spectrogram_transform(waveform)
>>> inverse_melscale_transform = transforms.InverseMelScale(n_stft=1024 // 2 + 1)
@@ -544,7 +548,8 @@ class MelSpectrogram(torch.nn.Module):
mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.MelSpectrogram(sample_rate)
>>> mel_specgram = transform(waveform) # (channel, n_mels, time)
@@ -646,7 +651,8 @@ class MFCC(torch.nn.Module):
melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.MFCC(
>>> sample_rate=sample_rate,
>>> n_mfcc=13,
@@ -736,7 +742,8 @@ class LFCC(torch.nn.Module):
speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.LFCC(
>>> sample_rate=sample_rate,
>>> n_lfcc=13,
@@ -836,7 +843,8 @@ class MuLawEncoding(torch.nn.Module):
quantization_channels (int, optional): Number of channels. (Default: ``256``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512)
>>> mulawtrans = transform(waveform)
@@ -875,7 +883,8 @@ class MuLawDecoding(torch.nn.Module):
quantization_channels (int, optional): Number of channels. (Default: ``256``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512)
>>> mulawtrans = transform(waveform)
"""
@@ -928,7 +937,8 @@ class Resample(torch.nn.Module):
carried out on ``torch.float64``.
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.Resample(sample_rate, sample_rate/10)
>>> waveform = transform(waveform)
"""
@@ -1098,7 +1108,8 @@ class Fade(torch.nn.Module):
(Default: ``"linear"``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape="linear")
>>> faded_waveform = transform(waveform)
"""
@@ -1359,7 +1370,9 @@ class Loudness(torch.nn.Module):
sample_rate (int): Sample rate of audio signal.
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.Loudness(sample_rate)
>>> loudness = transform(waveform)
@@ -1398,7 +1411,9 @@ class Vol(torch.nn.Module):
gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.Vol(gain=0.5, gain_type="amplitude")
>>> quieter_waveform = transform(waveform)
"""
@@ -1448,7 +1463,9 @@ class SlidingWindowCmn(torch.nn.Module):
norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.SlidingWindowCmn(cmn_window=1000)
>>> cmn_waveform = transform(waveform)
"""
@@ -1528,7 +1545,9 @@ class Vad(torch.nn.Module):
in the detector algorithm. (Default: 2000.0)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> waveform_reversed, sample_rate = apply_effects_tensor(waveform, sample_rate, [["reverse"]])
>>> transform = transforms.Vad(sample_rate=sample_rate, trigger_level=7.5)
>>> waveform_reversed_front_trim = transform(waveform_reversed)
@@ -1631,7 +1650,9 @@ class SpectralCentroid(torch.nn.Module):
wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.SpectralCentroid(sample_rate)
>>> spectral_centroid = transform(waveform) # (channel, time)
"""
@@ -1690,7 +1711,9 @@ class PitchShift(LazyModuleMixin, torch.nn.Module):
If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``).
Example
- >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
+ >>> from torchaudio.utils import load_torchcodec
+ >>>
+ >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True)
>>> transform = transforms.PitchShift(sample_rate, 4)
>>> waveform_shift = transform(waveform) # (channel, time)
"""
diff --git a/src/torchaudio/utils/__init__.py b/src/torchaudio/utils/__init__.py
index 89bffaa34d..b4c76baf6b 100644
--- a/src/torchaudio/utils/__init__.py
+++ b/src/torchaudio/utils/__init__.py
@@ -2,9 +2,22 @@
from . import sox_utils
from .download import download_asset
+import os
+def load_torchcodec(file, normalize=True, channels_first=True, start_seconds=0.0, stop_seconds=None, **args):
+ if not normalize:
+ raise Exception("Torchcodec does not support non-normalized file reading")
+ try:
+ from torchcodec.decoders import AudioDecoder
+ except:
+ raise Exception("To use this feature, you must install torchcodec. See https://github.com/pytorch/torchcodec for installation instructions")
+ decoder = AudioDecoder(file, **args)
+ samples = decoder.get_samples_played_in_range(start_seconds, stop_seconds)
+ data = samples.data if channels_first else samples.data.T
+ return (data, samples.sample_rate)
__all__ = [
+ "load_torchcodec",
"download_asset",
"sox_utils",
"ffmpeg_utils",
diff --git a/src/torchaudio/utils/ffmpeg_utils.py b/src/torchaudio/utils/ffmpeg_utils.py
index 385596edc1..04358a0494 100644
--- a/src/torchaudio/utils/ffmpeg_utils.py
+++ b/src/torchaudio/utils/ffmpeg_utils.py
@@ -1,6 +1,6 @@
"""Module to change the configuration of FFmpeg libraries (such as libavformat).
-It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.load`).
+It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`load_torchcodec`).
"""
diff --git a/test/integration_tests/loudness_compliance_test.py b/test/integration_tests/loudness_compliance_test.py
index d9473cfa50..3c28affb54 100644
--- a/test/integration_tests/loudness_compliance_test.py
+++ b/test/integration_tests/loudness_compliance_test.py
@@ -5,6 +5,7 @@
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
import torchaudio.functional as F
@@ -40,7 +41,7 @@ def test_loudness(tmp_path, filename, url, expected):
with zipfile.ZipFile(zippath) as file:
file.extractall(zippath.parent)
- waveform, sample_rate = torchaudio.load(zippath.with_suffix(".wav"))
+ waveform, sample_rate = load_torchcodec(zippath.with_suffix(".wav"))
loudness = F.loudness(waveform, sample_rate)
expected = torch.tensor(expected, dtype=loudness.dtype, device=loudness.device)
assert torch.allclose(loudness, expected, rtol=0.01, atol=0.1)
diff --git a/test/integration_tests/prototype/vggish_pipeline_test.py b/test/integration_tests/prototype/vggish_pipeline_test.py
index 72c6e1e518..25a27b7e10 100644
--- a/test/integration_tests/prototype/vggish_pipeline_test.py
+++ b/test/integration_tests/prototype/vggish_pipeline_test.py
@@ -1,4 +1,5 @@
import torchaudio
+from torchaudio.utils import load_torchcodec
from torchaudio.prototype.pipelines import VGGISH
@@ -7,7 +8,7 @@ def test_vggish():
input_proc = VGGISH.get_input_processor()
model = VGGISH.get_model()
path = torchaudio.utils.download_asset("test-assets/Chopin_Ballade_-1_In_G_Minor,_Op._23_excerpt.mp3")
- waveform, sr = torchaudio.load(path, backend="ffmpeg")
+ waveform, sr = load_torchcodec(path, backend="ffmpeg")
waveform = waveform.mean(axis=0)
waveform = torchaudio.functional.resample(waveform, sr, input_sr)
batch = input_proc(waveform)
diff --git a/test/integration_tests/rnnt_pipeline_test.py b/test/integration_tests/rnnt_pipeline_test.py
index 6827d27d46..fbcce60f6d 100644
--- a/test/integration_tests/rnnt_pipeline_test.py
+++ b/test/integration_tests/rnnt_pipeline_test.py
@@ -1,5 +1,6 @@
import pytest
import torchaudio
+from torchaudio.utils import load_torchcodec
from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
@@ -16,7 +17,7 @@ def test_rnnt(bundle, sample_speech, expected):
feature_extractor = bundle.get_feature_extractor()
decoder = bundle.get_decoder().eval()
token_processor = bundle.get_token_processor()
- waveform, _ = torchaudio.load(sample_speech)
+ waveform, _ = load_torchcodec(sample_speech)
features, length = feature_extractor(waveform.squeeze())
hypotheses = decoder(features, length, 10)
text = token_processor(hypotheses[0][0])
diff --git a/test/integration_tests/source_separation_pipeline_test.py b/test/integration_tests/source_separation_pipeline_test.py
index 7507958400..c56683dcc0 100644
--- a/test/integration_tests/source_separation_pipeline_test.py
+++ b/test/integration_tests/source_separation_pipeline_test.py
@@ -4,6 +4,7 @@
import pytest
import torch
import torchaudio
+from torchaudio.utils import load_torchcodec
from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX, HDEMUCS_HIGH_MUSDB, HDEMUCS_HIGH_MUSDB_PLUS
@@ -27,11 +28,11 @@ def test_source_separation_models(bundle, task, channel, expected_score, mixture
Si-SDR score should be equal to or larger than the expected score.
"""
model = bundle.get_model()
- mixture_waveform, sample_rate = torchaudio.load(mixture_source)
+ mixture_waveform, sample_rate = load_torchcodec(mixture_source)
assert sample_rate == bundle.sample_rate, "The sample rate of audio must match that in the bundle."
clean_waveforms = []
for source in clean_sources:
- clean_waveform, sample_rate = torchaudio.load(source)
+ clean_waveform, sample_rate = load_torchcodec(source)
assert sample_rate == bundle.sample_rate, "The sample rate of audio must match that in the bundle."
clean_waveforms.append(clean_waveform)
mixture_waveform = mixture_waveform.reshape(1, channel, -1)
diff --git a/test/integration_tests/squim_pipeline_test.py b/test/integration_tests/squim_pipeline_test.py
index 9f78bba4d4..c8b21a14d5 100644
--- a/test/integration_tests/squim_pipeline_test.py
+++ b/test/integration_tests/squim_pipeline_test.py
@@ -1,5 +1,6 @@
import pytest
import torchaudio
+from torchaudio.utils import load_torchcodec
from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
@@ -16,7 +17,7 @@ def test_squim_objective_pretrained_weights(lang, expected, sample_speech):
# Get SquimObjective model
model = bundle.get_model()
# Create a synthetic waveform
- waveform, sample_rate = torchaudio.load(sample_speech)
+ waveform, sample_rate = load_torchcodec(sample_speech)
scores = model(waveform)
for i in range(3):
assert abs(scores[i].item() - expected[i]) < 1e-5
@@ -35,9 +36,9 @@ def test_squim_subjective_pretrained_weights(task, expected, mixture_source, cle
# Get SquimObjective model
model = bundle.get_model()
# Load input mixture audio
- waveform, sample_rate = torchaudio.load(mixture_source)
+ waveform, sample_rate = load_torchcodec(mixture_source)
for i, source in enumerate(clean_sources):
# Load clean reference
- clean_waveform, sample_rate = torchaudio.load(source)
+ clean_waveform, sample_rate = load_torchcodec(source)
score = model(waveform, clean_waveform)
assert abs(score.item() - expected[i]) < 1e-5
diff --git a/test/integration_tests/wav2vec2_pipeline_test.py b/test/integration_tests/wav2vec2_pipeline_test.py
index c863ea3688..a6489169b1 100644
--- a/test/integration_tests/wav2vec2_pipeline_test.py
+++ b/test/integration_tests/wav2vec2_pipeline_test.py
@@ -2,6 +2,7 @@
import pytest
import torchaudio
+from torchaudio.utils import load_torchcodec
from torchaudio.pipelines import (
HUBERT_ASR_LARGE,
HUBERT_ASR_XLARGE,
@@ -113,7 +114,7 @@ def test_finetune_asr_model(
):
"""Smoke test of downloading weights for fine-tuning models and simple transcription"""
model = bundle.get_model().eval()
- waveform, sample_rate = torchaudio.load(sample_speech)
+ waveform, sample_rate = load_torchcodec(sample_speech)
emission, _ = model(waveform)
decoder = ctc_decoder(bundle.get_labels())
result = decoder(emission[0])
diff --git a/test/torchaudio_unittest/conftest.py b/test/torchaudio_unittest/conftest.py
new file mode 100644
index 0000000000..0a20827ade
--- /dev/null
+++ b/test/torchaudio_unittest/conftest.py
@@ -0,0 +1,14 @@
+import pytest
+import os
+
+
+def pytest_collection_modifyitems(config, items):
+ fail_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "ffmpeg_fail_ids.txt")
+ with open(fail_path, 'r') as file:
+ fail_ids = set([f.strip() for f in file.readlines()])
+
+ skip_marker = pytest.mark.skip(reason="FFMPEG incompatible with CI runner")
+
+ for item in items:
+ if item.nodeid in fail_ids:
+ item.add_marker(skip_marker)
diff --git a/test/torchaudio_unittest/ffmpeg_fail_ids.txt b/test/torchaudio_unittest/ffmpeg_fail_ids.txt
new file mode 100644
index 0000000000..50bd062384
--- /dev/null
+++ b/test/torchaudio_unittest/ffmpeg_fail_ids.txt
@@ -0,0 +1,228 @@
+test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_path
+test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_str
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_path
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_str
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceFR::test_commonvoice_str
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_path
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_str
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_path
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_str
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTest
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTrain
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsValid
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_no_subset
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_str
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_str
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_str
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPFullDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPImprovisedDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPScriptedDataset
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10h
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10min
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_1h
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_0_sep_clean
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_1_enh_single
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_2_enh_both
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_3_sep_noisy
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_0_sep_clean
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_1_enh_single
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_2_enh_both
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_3_sep_noisy
+test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_path
+test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_str
+test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_path
+test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_str
+test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_path
+test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_str
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_6
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDev
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDocs
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14Subsetval
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetEval
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceEN::test_commonvoice_path
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceEN::test_commonvoice_str
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPFullDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPImprovisedDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPScriptedDataset
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTest
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTrain
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsValid
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTest
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTrain
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetValid
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_path
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_str
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_path
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_str
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release2
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release3
+test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_path
+test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_str
+test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Identification::testVoxCeleb1SubsetTrain
+test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Verification::testVoxCeleb1Verification
+test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_path
+test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_str
+test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix2::test_wsj0mix
+test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix3::test_wsj0mix
+test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_path
+test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_str
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_path
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_str
+test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceFR::test_commonvoice_str
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_path
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_str
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_path
+test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_str
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTest
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTrain
+test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsValid
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_no_subset
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_str
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_str
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_path
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_str
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPFullDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPImprovisedDataset
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPScriptedDataset
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10h
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10min
+test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_1h
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_0_sep_clean
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_1_enh_single
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_2_enh_both
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_3_sep_noisy
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_0_sep_clean
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_1_enh_single
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_2_enh_both
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_3_sep_noisy
+test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_path
+test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_str
+test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_path
+test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_str
+test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_path
+test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_str
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_6
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_0
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_1
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_2
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_3
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_4
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_5
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_6
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_0_albanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_1_basque
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_2_czech
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_3_nnenglish
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_4_romanian
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_5_slovak
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDev
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDocs
+test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14Subsetval
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTest
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTrain
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsValid
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTest
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTrain
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetValid
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_path
+test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_str
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_path
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_str
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release2
+test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release3
+test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_path
+test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_str
+test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Identification::testVoxCeleb1SubsetTrain
+test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Verification::testVoxCeleb1Verification
+test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_path
+test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_str
+test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix2::test_wsj0mix
+test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix3::test_wsj0mix