diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 8859b827f0..9170f45a01 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" # 2. Install torchaudio @@ -85,6 +85,9 @@ export BUILD_CPP_TEST=1 python setup.py install # 3. Install Test tools +conda install -y "ffmpeg<5" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" + printf "* Installing test tools\n" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -94,7 +97,7 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' 'ffmpeg>=6,<7' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' pip install kaldi-io SoundFile librosa coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" diff --git a/docs/requirements.txt b/docs/requirements.txt index 8522161f40..485690e036 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,7 @@ Jinja2<3.1.0 matplotlib<=3.8 pyparsing<3,>=2.0.2 +torchcodec # C++ docs breathe==4.34.0 diff --git a/docs/source/index.rst b/docs/source/index.rst index bee740a167..cb74f4e957 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -182,7 +182,7 @@ Tutorials .. customcarditem:: :header: Loading waveform Tensors from files and saving them - :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info, torchaudio.load and torchaudio.save functions. + :card_description: Learn how to query/load audio files and save waveform tensors to files, using torchaudio.info, torchaudio.utils.load_torchcodec and torchaudio.save functions. :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/audio_io_tutorial.png :link: tutorials/audio_io_tutorial.html :tags: I/O @@ -399,7 +399,7 @@ In BibTeX format: .. 
code-block:: bibtex @misc{hwang2023torchaudio, - title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, + title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis}, year={2023}, eprint={2310.17864}, diff --git a/examples/asr/emformer_rnnt/mustc/dataset.py b/examples/asr/emformer_rnnt/mustc/dataset.py index 7417aec164..7628fa2630 100644 --- a/examples/asr/emformer_rnnt/mustc/dataset.py +++ b/examples/asr/emformer_rnnt/mustc/dataset.py @@ -4,6 +4,7 @@ import torch import torchaudio import yaml +from torchaudio.utils import load_torchcodec FOLDER_IN_ARCHIVE = "en-de" @@ -31,15 +32,15 @@ def __init__( self.idx_target_lengths = [] self.wav_list = [] for idx, item in enumerate(file_list): - offset = int(item["offset"] * SAMPLE_RATE) - duration = int(item["duration"] * SAMPLE_RATE) + offset = item["offset"] + duration = item["duration"] self.idx_target_lengths.append((idx, item["duration"])) file_path = wav_dir / item["wav"] self.wav_list.append((file_path, offset, duration)) def _get_mustc_item(self, idx): file_path, offset, duration = self.wav_list[idx] - waveform, sr = torchaudio.load(file_path, frame_offset=offset, num_frames=duration) + waveform, sr = load_torchcodec(file_path, start_seconds=offset, stop_seconds=offset + duration) assert sr == SAMPLE_RATE transcript = self.trans_list[idx].replace("\n", "") return (waveform, transcript) diff --git a/examples/avsr/data_prep/data/data_module.py b/examples/avsr/data_prep/data/data_module.py index 542e26147a..3df611f2f8 100644 --- a/examples/avsr/data_prep/data/data_module.py +++ b/examples/avsr/data_prep/data/data_module.py @@ -7,7 +7,7 @@ import torch import torchaudio import torchvision - +from torchaudio.utils import load_torchcodec class AVSRDataLoader: def __init__(self, modality, detector="retinaface", resize=None): @@ -39,7 +39,7 @@ def load_data(self, data_filename, transform=True): return video def load_audio(self, data_filename): - waveform, sample_rate = torchaudio.load(data_filename, normalize=True) + waveform, sample_rate = load_torchcodec(data_filename, normalize=True) return waveform, sample_rate def load_video(self, data_filename): diff --git a/examples/avsr/lrs3.py b/examples/avsr/lrs3.py index b58d96a061..57a77872f7 100644 --- a/examples/avsr/lrs3.py +++ b/examples/avsr/lrs3.py @@ -3,6 +3,7 @@ import torchaudio import torchvision from torch.utils.data import Dataset +from torchaudio.utils import load_torchcodec def _load_list(args, *filenames): @@ -31,7 +32,7 @@ def load_audio(path): """ rtype: torch, T x 1 """ - waveform, sample_rate = torchaudio.load(path, normalize=True) + waveform, sample_rate = load_torchcodec(path, normalize=True) return waveform.transpose(1, 0) diff --git a/examples/dnn_beamformer/datamodule.py b/examples/dnn_beamformer/datamodule.py index e6f81cbda2..fe82f96e08 100644 --- a/examples/dnn_beamformer/datamodule.py +++ b/examples/dnn_beamformer/datamodule.py @@ -8,6 +8,7 @@ from torch import Tensor from torch.utils.data import Dataset from utils import CollateFnL3DAS22 
+from torchaudio.utils import load_torchcodec _PREFIX = "L3DAS22_Task1_" _SUBSETS = { @@ -46,10 +47,10 @@ def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, str]: noisy_path_B = str(noisy_path_A).replace("_A.wav", "_B.wav") clean_path = noisy_path_A.parent.parent / "labels" / noisy_path_A.name.replace("_A.wav", ".wav") transcript_path = str(clean_path).replace("wav", "txt") - waveform_noisy_A, sample_rate1 = torchaudio.load(noisy_path_A) - waveform_noisy_B, sample_rate2 = torchaudio.load(noisy_path_B) + waveform_noisy_A, sample_rate1 = load_torchcodec(noisy_path_A) + waveform_noisy_B, sample_rate2 = load_torchcodec(noisy_path_B) waveform_noisy = torch.cat((waveform_noisy_A, waveform_noisy_B), dim=0) - waveform_clean, sample_rate3 = torchaudio.load(clean_path) + waveform_clean, sample_rate3 = load_torchcodec(clean_path) assert sample_rate1 == _SAMPLE_RATE and sample_rate2 == _SAMPLE_RATE and sample_rate3 == _SAMPLE_RATE with open(transcript_path, "r") as f: transcript = f.readline() diff --git a/examples/hubert/dataset/hubert_dataset.py b/examples/hubert/dataset/hubert_dataset.py index 3670628fa1..967967f549 100644 --- a/examples/hubert/dataset/hubert_dataset.py +++ b/examples/hubert/dataset/hubert_dataset.py @@ -12,6 +12,9 @@ from torch import Tensor from torch.utils.data import BatchSampler, Dataset, DistributedSampler +from torchaudio.utils import load_torchcodec + + sys.path.append("..") from utils import _get_label2id @@ -299,7 +302,7 @@ def _load_audio(self, index: int) -> Tensor: (Tensor): The corresponding waveform Tensor. """ wav_path = self.f_list[index] - waveform, sample_rate = torchaudio.load(wav_path) + waveform, sample_rate = load_torchcodec(wav_path) assert waveform.shape[1] == self.len_list[index] return waveform diff --git a/examples/hubert/utils/feature_utils.py b/examples/hubert/utils/feature_utils.py index 534d4f10fe..918d7cfcd5 100644 --- a/examples/hubert/utils/feature_utils.py +++ b/examples/hubert/utils/feature_utils.py @@ -13,6 +13,7 @@ from torch.nn import Module from .common_utils import _get_feat_lens_paths +from torchaudio.utils import load_torchcodec _LG = logging.getLogger(__name__) _DEFAULT_DEVICE = torch.device("cpu") @@ -53,7 +54,7 @@ def extract_feature_mfcc( Returns: Tensor: The desired feature tensor of the given audio file. """ - waveform, sr = torchaudio.load(path) + waveform, sr = load_torchcodec(path) assert sr == sample_rate feature_extractor = torchaudio.transforms.MFCC( sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 160, "center": False} @@ -88,7 +89,7 @@ def extract_feature_hubert( Returns: Tensor: The desired feature tensor of the given audio file. """ - waveform, sr = torchaudio.load(path) + waveform, sr = load_torchcodec(path) assert sr == sample_rate waveform = waveform.to(device) with torch.inference_mode(): diff --git a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py index 79f56819fc..b050de04d4 100755 --- a/examples/libtorchaudio/augmentation/create_jittable_pipeline.py +++ b/examples/libtorchaudio/augmentation/create_jittable_pipeline.py @@ -7,7 +7,7 @@ import torch import torchaudio - +from torchaudio.utils import load_torchcodec class Pipeline(torch.nn.Module): """Example audio process pipeline. 
@@ -17,7 +17,7 @@ class Pipeline(torch.nn.Module): def __init__(self, rir_path: str): super().__init__() - rir, sample_rate = torchaudio.load(rir_path) + rir, sample_rate = load_torchcodec(rir_path) self.register_buffer("rir", rir) self.rir_sample_rate: int = sample_rate @@ -25,7 +25,7 @@ def forward(self, input_path: str, output_path: str): torchaudio.sox_effects.init_sox_effects() # 1. load audio - waveform, sample_rate = torchaudio.load(input_path) + waveform, sample_rate = load_torchcodec(input_path) # 2. Add background noise alpha = 0.01 diff --git a/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py b/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py index dcbe3c011a..9a175601f6 100644 --- a/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py +++ b/examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py @@ -14,6 +14,7 @@ from greedy_decoder import Decoder from torch.utils.mobile_optimizer import optimize_for_mobile from torchaudio.models.wav2vec2.utils.import_fairseq import import_fairseq_model +from torchaudio.utils import load_torchcodec TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2]) if TORCH_VERSION >= (1, 10): @@ -58,7 +59,7 @@ def _parse_args(): class Loader(torch.nn.Module): def forward(self, audio_path: str) -> torch.Tensor: - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load_torchcodec(audio_path) if sample_rate != 16000: waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0) return waveform diff --git a/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py b/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py index 344d3d09a2..6e0b05b1df 100644 --- a/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py +++ b/examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py @@ -8,6 +8,7 @@ import torchaudio from greedy_decoder import Decoder from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model +from torchaudio.utils import load_torchcodec TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2]) if TORCH_VERSION >= (1, 10): @@ -49,7 +50,7 @@ def _parse_args(): class Loader(torch.nn.Module): def forward(self, audio_path: str) -> torch.Tensor: - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load_torchcodec(audio_path) if sample_rate != 16000: waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0) return waveform diff --git a/examples/self_supervised_learning/data_modules/_utils.py b/examples/self_supervised_learning/data_modules/_utils.py index 0333ca605d..b63eb77a43 100644 --- a/examples/self_supervised_learning/data_modules/_utils.py +++ b/examples/self_supervised_learning/data_modules/_utils.py @@ -8,6 +8,7 @@ import torchaudio from torch import Tensor from torch.utils.data import BatchSampler, Dataset, DistributedSampler +from torchaudio.utils import load_torchcodec from ..lightning_modules import Batch @@ -295,7 +296,7 @@ def _load_audio(self, index: int) -> Tensor: (Tensor): The corresponding waveform Tensor. 
""" wav_path = self.f_list[index] - waveform, sample_rate = torchaudio.load(wav_path) + waveform, sample_rate = load_torchcodec(wav_path) assert waveform.shape[1] == self.len_list[index] return waveform diff --git a/examples/source_separation/utils/dataset/wsj0mix.py b/examples/source_separation/utils/dataset/wsj0mix.py index 3d3c5f826d..8846ce3f42 100644 --- a/examples/source_separation/utils/dataset/wsj0mix.py +++ b/examples/source_separation/utils/dataset/wsj0mix.py @@ -4,6 +4,7 @@ import torch import torchaudio from torch.utils.data import Dataset +from torchaudio.utils import load_torchcodec SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]] @@ -37,7 +38,7 @@ def __init__( self.files.sort() def _load_audio(self, path) -> torch.Tensor: - waveform, sample_rate = torchaudio.load(path) + waveform, sample_rate = load_torchcodec(path) if sample_rate != self.sample_rate: raise ValueError( f"The dataset contains audio file of sample rate {sample_rate}, " diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py index 624cd8066a..775492a53c 100644 --- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py +++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py @@ -65,6 +65,7 @@ import matplotlib.pyplot as plt from torchaudio.models.decoder import ctc_decoder from torchaudio.utils import download_asset +from torchaudio.utils import load_torchcodec ###################################################################### # @@ -98,7 +99,7 @@ # i really was very much afraid of showing him how much shocked i was at some parts of what he said # -waveform, sample_rate = torchaudio.load(speech_file) +waveform, sample_rate = load_torchcodec(speech_file) if sample_rate != bundle.sample_rate: waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) diff --git a/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py index 8329d8a40e..ae17513c35 100755 --- a/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py +++ b/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py @@ -54,6 +54,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -96,7 +97,7 @@ def download_asset_external(url, key): # speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav") -waveform, sample_rate = torchaudio.load(speech_file) +waveform, sample_rate = load_torchcodec(speech_file) assert sample_rate == 16000 IPython.display.Audio(speech_file) diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py index 734cb57bb4..7b3bc6042d 100644 --- a/examples/tutorials/audio_data_augmentation_tutorial.py +++ b/examples/tutorials/audio_data_augmentation_tutorial.py @@ -15,6 +15,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec import torchaudio.functional as F print(torch.__version__) @@ -52,7 +53,7 @@ # # Load the data -waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False) +waveform1, sample_rate = load_torchcodec(SAMPLE_WAV, channels_first=False) # Define effects effect = ",".join( @@ -159,7 +160,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # and clap your hands. 
# -rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR) +rir_raw, sample_rate = load_torchcodec(SAMPLE_RIR) plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)") plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") Audio(rir_raw, rate=sample_rate) @@ -179,7 +180,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # we convolve the speech signal with the RIR. # -speech, _ = torchaudio.load(SAMPLE_SPEECH) +speech, _ = load_torchcodec(SAMPLE_SPEECH) augmented = F.fftconvolve(speech, rir) ###################################################################### @@ -219,8 +220,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # To add noise to audio data per SNRs, we # use :py:func:`torchaudio.functional.add_noise`. -speech, _ = torchaudio.load(SAMPLE_SPEECH) -noise, _ = torchaudio.load(SAMPLE_NOISE) +speech, _ = load_torchcodec(SAMPLE_SPEECH) +noise, _ = load_torchcodec(SAMPLE_NOISE) noise = noise[:, : speech.shape[1]] snr_dbs = torch.tensor([20, 10, 3]) @@ -275,7 +276,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # a Tensor object. # -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False) +waveform, sample_rate = load_torchcodec(SAMPLE_SPEECH, channels_first=False) def apply_codec(waveform, sample_rate, format, encoder=None): @@ -332,7 +333,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None): # sample_rate = 16000 -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) +original_speech, sample_rate = load_torchcodec(SAMPLE_SPEECH) plot_specgram(original_speech, sample_rate, title="Original") @@ -345,7 +346,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None): # Because the noise is recorded in the actual environment, we consider that # the noise contains the acoustic feature of the environment. Therefore, we add # the noise after RIR application. -noise, _ = torchaudio.load(SAMPLE_NOISE) +noise, _ = load_torchcodec(SAMPLE_NOISE) noise = noise[:, : rir_applied.shape[1]] snr_db = torch.tensor([8]) diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py index eb43c6dca8..7b81333e1c 100644 --- a/examples/tutorials/audio_feature_extractions_tutorial.py +++ b/examples/tutorials/audio_feature_extractions_tutorial.py @@ -21,6 +21,7 @@ import torchaudio import torchaudio.functional as F import torchaudio.transforms as T +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -103,7 +104,7 @@ def plot_fbank(fbank, title=None): # # Load audio -SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) +SPEECH_WAVEFORM, SAMPLE_RATE = load_torchcodec(SAMPLE_SPEECH) # Define transform spectrogram = T.Spectrogram(n_fft=512) diff --git a/examples/tutorials/audio_io_tutorial.py b/examples/tutorials/audio_io_tutorial.py index ddcd931f62..daf6cd20ef 100644 --- a/examples/tutorials/audio_io_tutorial.py +++ b/examples/tutorials/audio_io_tutorial.py @@ -22,6 +22,8 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec +from io import BytesIO print(torch.__version__) print(torchaudio.__version__) @@ -151,7 +153,7 @@ def read(self, n): # Loading audio data # ------------------ # -# To load audio data, you can use :py:func:`torchaudio.load`. +# To load audio data, you can use :py:func:`load_torchcodec`. # # This function accepts a path-like object or file-like object as input. 
# @@ -165,7 +167,7 @@ def read(self, n): # documentation `__. # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) +waveform, sample_rate = load_torchcodec(SAMPLE_WAV) ###################################################################### @@ -219,10 +221,10 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): Audio(waveform.numpy()[0], rate=sample_rate) ###################################################################### -# Loading from file-like object +# Loading from URLs and file-like object # ----------------------------- # -# The I/O functions support file-like objects. +# The I/O functions support URLs and file-like objects. # This allows for fetching and decoding audio data from locations # within and beyond the local file system. # The following examples illustrate this. @@ -231,10 +233,9 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): ###################################################################### # -# Load audio data as HTTP request +# Load audio data from an HTTP request url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" -with requests.get(url, stream=True) as response: - waveform, sample_rate = torchaudio.load(_hide_seek(response.raw)) +waveform, sample_rate = load_torchcodec(url) plot_specgram(waveform, sample_rate, title="HTTP datasource") ###################################################################### @@ -245,7 +246,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" with tarfile.open(tar_path, mode="r") as tarfile_: fileobj = tarfile_.extractfile(tar_item) - waveform, sample_rate = torchaudio.load(fileobj) + waveform, sample_rate = load_torchcodec(fileobj) plot_specgram(waveform, sample_rate, title="TAR file") ###################################################################### @@ -256,7 +257,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) response = client.get_object(Bucket=bucket, Key=key) -waveform, sample_rate = torchaudio.load(_hide_seek(response["Body"])) +waveform, sample_rate = load_torchcodec(BytesIO(response['Body'].read())) plot_specgram(waveform, sample_rate, title="From S3") @@ -289,17 +290,13 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" print("Fetching all the data...") -with requests.get(url, stream=True) as response: - waveform1, sample_rate1 = torchaudio.load(_hide_seek(response.raw)) - waveform1 = waveform1[:, frame_offset : frame_offset + num_frames] - print(f" - Fetched {response.raw.tell()} bytes") +waveform1, sample_rate1 = load_torchcodec(url) +waveform1 = waveform1[:, frame_offset : frame_offset + num_frames] print("Fetching until the requested frames are available...") -with requests.get(url, stream=True) as response: - waveform2, sample_rate2 = torchaudio.load( - _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames - ) - print(f" - Fetched {response.raw.tell()} bytes") +waveform2, sample_rate2 = load_torchcodec( + url, start_seconds=1, stop_seconds=2 +) print("Checking the resulting waveform ... 
", end="") assert (waveform1 == waveform2).all() @@ -331,7 +328,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): # resulting file size but also precision. # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) +waveform, sample_rate = load_torchcodec(SAMPLE_WAV) ###################################################################### @@ -383,7 +380,7 @@ def inspect_file(path): ###################################################################### # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000) +waveform, sample_rate = load_torchcodec(SAMPLE_WAV_8000) with tempfile.TemporaryDirectory() as tempdir: for format in formats: path = f"{tempdir}/save_example.{format}" @@ -400,7 +397,7 @@ def inspect_file(path): # -waveform, sample_rate = torchaudio.load(SAMPLE_WAV) +waveform, sample_rate = load_torchcodec(SAMPLE_WAV) # Saving to bytes buffer buffer_ = io.BytesIO() diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py index 789fa3cf85..610ccc9abc 100644 --- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py +++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py @@ -39,6 +39,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -63,7 +64,7 @@ # SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -waveform, _ = torchaudio.load(SPEECH_FILE) +waveform, _ = load_torchcodec(SPEECH_FILE) TRANSCRIPT = "i had that curiosity beside me at this moment".split() diff --git a/examples/tutorials/effector_tutorial.py b/examples/tutorials/effector_tutorial.py index 8eadcf6ef4..dffa35e893 100644 --- a/examples/tutorials/effector_tutorial.py +++ b/examples/tutorials/effector_tutorial.py @@ -43,6 +43,7 @@ # import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -92,7 +93,7 @@ # src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -waveform, sr = torchaudio.load(src, channels_first=False) +waveform, sr = load_torchcodec(src, channels_first=False) ###################################################################### diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py index 00dfe68b9d..aa21a6076a 100644 --- a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py +++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py @@ -26,6 +26,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -244,9 +245,8 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam text_normalized = "aber seit ich bei ihnen das brot hole" url = "https://download.pytorch.org/torchaudio/tutorial-assets/10349_8674_000087.flac" -waveform, sample_rate = torchaudio.load( - url, frame_offset=int(0.5 * bundle.sample_rate), num_frames=int(2.5 * bundle.sample_rate) -) +waveform, sample_rate = load_torchcodec( + url, start_seconds=0.5, stop_seconds=3) ###################################################################### # @@ -326,7 +326,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam # url = "https://download.pytorch.org/torchaudio/tutorial-assets/mvdr/clean_speech.wav" -waveform, 
sample_rate = torchaudio.load(url) +waveform, sample_rate = load_torchcodec(url) waveform = waveform[0:1] ###################################################################### @@ -400,7 +400,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane" url = "https://download.pytorch.org/torchaudio/tutorial-assets/5090_1447_000088.flac" -waveform, sample_rate = torchaudio.load(url, num_frames=int(4.5 * bundle.sample_rate)) +waveform, sample_rate = load_torchcodec(url, stop_seconds=4.5) ###################################################################### # @@ -467,9 +467,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam text_normalized = "na imensa extensao onde se esconde o inconsciente imortal" url = "https://download.pytorch.org/torchaudio/tutorial-assets/6566_5323_000027.flac" -waveform, sample_rate = torchaudio.load( - url, frame_offset=int(bundle.sample_rate), num_frames=int(4.6 * bundle.sample_rate) -) +waveform, sample_rate = load_torchcodec(url, start_seconds=1, stop_seconds=4.6) ###################################################################### # @@ -542,7 +540,7 @@ def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sam text_normalized = "elle giacean per terra tutte quante" url = "https://download.pytorch.org/torchaudio/tutorial-assets/642_529_000025.flac" -waveform, sample_rate = torchaudio.load(url, num_frames=int(4 * bundle.sample_rate)) +waveform, sample_rate = load_torchcodec(url, stop_seconds=4) ###################################################################### # diff --git a/examples/tutorials/forced_alignment_tutorial.py b/examples/tutorials/forced_alignment_tutorial.py index 624037da9d..a10fea4dcc 100644 --- a/examples/tutorials/forced_alignment_tutorial.py +++ b/examples/tutorials/forced_alignment_tutorial.py @@ -42,6 +42,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -106,7 +107,7 @@ model = bundle.get_model().to(device) labels = bundle.get_labels() with torch.inference_mode(): - waveform, _ = torchaudio.load(SPEECH_FILE) + waveform, _ = load_torchcodec(SPEECH_FILE) emissions, _ = model(waveform.to(device)) emissions = torch.log_softmax(emissions, dim=-1) diff --git a/examples/tutorials/hybrid_demucs_tutorial.py b/examples/tutorials/hybrid_demucs_tutorial.py index 081534bfe4..6bb90d9987 100644 --- a/examples/tutorials/hybrid_demucs_tutorial.py +++ b/examples/tutorials/hybrid_demucs_tutorial.py @@ -41,6 +41,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -187,7 +188,7 @@ def plot_spectrogram(stft, title="Spectrogram"): # We download the audio file from our storage. 
Feel free to download another file and use audio from a specific path SAMPLE_SONG = download_asset("tutorial-assets/hdemucs_mix.wav") -waveform, sample_rate = torchaudio.load(SAMPLE_SONG) # replace SAMPLE_SONG with desired path for different song +waveform, sample_rate = load_torchcodec(SAMPLE_SONG) # replace SAMPLE_SONG with desired path for different song waveform = waveform.to(device) mixture = waveform @@ -267,16 +268,16 @@ def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav") drums_spec = audios["drums"][:, frame_start:frame_end].cpu() -drums, sample_rate = torchaudio.load(drums_original) +drums, sample_rate = load_torchcodec(drums_original) bass_spec = audios["bass"][:, frame_start:frame_end].cpu() -bass, sample_rate = torchaudio.load(bass_original) +bass, sample_rate = load_torchcodec(bass_original) vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu() -vocals, sample_rate = torchaudio.load(vocals_original) +vocals, sample_rate = load_torchcodec(vocals_original) other_spec = audios["other"][:, frame_start:frame_end].cpu() -other, sample_rate = torchaudio.load(other_original) +other, sample_rate = load_torchcodec(other_original) mix_spec = mixture[:, frame_start:frame_end].cpu() diff --git a/examples/tutorials/mvdr_tutorial.py b/examples/tutorials/mvdr_tutorial.py index 442f6234a6..8c9e59dcf6 100644 --- a/examples/tutorials/mvdr_tutorial.py +++ b/examples/tutorials/mvdr_tutorial.py @@ -31,6 +31,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec import torchaudio.functional as F print(torch.__version__) @@ -170,8 +171,8 @@ def evaluate(estimate, reference): # ~~~~~~~~~~~~~~~~~~~~ # -waveform_clean, sr = torchaudio.load(SAMPLE_CLEAN) -waveform_noise, sr2 = torchaudio.load(SAMPLE_NOISE) +waveform_clean, sr = load_torchcodec(SAMPLE_CLEAN) +waveform_noise, sr2 = load_torchcodec(SAMPLE_NOISE) assert sr == sr2 == SAMPLE_RATE # The mixture waveform is a combination of clean and noise waveforms with a desired SNR. target_snr = 3 diff --git a/examples/tutorials/speech_recognition_pipeline_tutorial.py b/examples/tutorials/speech_recognition_pipeline_tutorial.py index 2d815a2e8e..83c7ec0f3b 100644 --- a/examples/tutorials/speech_recognition_pipeline_tutorial.py +++ b/examples/tutorials/speech_recognition_pipeline_tutorial.py @@ -37,6 +37,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -114,7 +115,7 @@ ###################################################################### -# To load data, we use :py:func:`torchaudio.load`. +# To load data, we use :py:func:`load_torchcodec`. # # If the sampling rate is different from what the pipeline expects, then # we can use :py:func:`torchaudio.functional.resample` for resampling. @@ -126,7 +127,7 @@ # using :py:class:`torchaudio.transforms.Resample` might improve the performace. 
# -waveform, sample_rate = torchaudio.load(SPEECH_FILE) +waveform, sample_rate = load_torchcodec(SPEECH_FILE) waveform = waveform.to(device) if sample_rate != bundle.sample_rate: diff --git a/examples/tutorials/squim_tutorial.py b/examples/tutorials/squim_tutorial.py index 9b9b55ac2e..792f2356d9 100644 --- a/examples/tutorials/squim_tutorial.py +++ b/examples/tutorials/squim_tutorial.py @@ -62,6 +62,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -158,8 +159,8 @@ def plot(waveform, title, sample_rate=16000): # # -WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = torchaudio.load(SAMPLE_SPEECH) -WAVEFORM_NOISE, SAMPLE_RATE_NOISE = torchaudio.load(SAMPLE_NOISE) +WAVEFORM_SPEECH, SAMPLE_RATE_SPEECH = load_torchcodec(SAMPLE_SPEECH) +WAVEFORM_NOISE, SAMPLE_RATE_NOISE = load_torchcodec(SAMPLE_NOISE) WAVEFORM_NOISE = WAVEFORM_NOISE[0:1, :] @@ -328,7 +329,7 @@ def plot(waveform, title, sample_rate=16000): NMR_SPEECH = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav") -WAVEFORM_NMR, SAMPLE_RATE_NMR = torchaudio.load(NMR_SPEECH) +WAVEFORM_NMR, SAMPLE_RATE_NMR = load_torchcodec(NMR_SPEECH) if SAMPLE_RATE_NMR != 16000: WAVEFORM_NMR = F.resample(WAVEFORM_NMR, SAMPLE_RATE_NMR, 16000) diff --git a/examples/tutorials/streamwriter_advanced.py b/examples/tutorials/streamwriter_advanced.py index 37347d1387..29f0efe111 100644 --- a/examples/tutorials/streamwriter_advanced.py +++ b/examples/tutorials/streamwriter_advanced.py @@ -64,6 +64,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -128,7 +129,7 @@ # # Prepare sample audio -waveform, sample_rate = torchaudio.load(AUDIO_PATH, channels_first=False, normalize=False) +waveform, sample_rate = load_torchcodec(AUDIO_PATH, channels_first=False, normalize=False) num_frames, num_channels = waveform.shape ###################################################################### diff --git a/examples/tutorials/streamwriter_basic_tutorial.py b/examples/tutorials/streamwriter_basic_tutorial.py index 35af1a177d..714c4bbadc 100644 --- a/examples/tutorials/streamwriter_basic_tutorial.py +++ b/examples/tutorials/streamwriter_basic_tutorial.py @@ -52,6 +52,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec print(torch.__version__) print(torchaudio.__version__) @@ -74,7 +75,7 @@ from torchaudio.utils import download_asset SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False) +WAVEFORM, SAMPLE_RATE = load_torchcodec(SAMPLE_PATH, channels_first=False) NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape _BASE_DIR = tempfile.TemporaryDirectory() diff --git a/src/torchaudio/datasets/cmuarctic.py b/src/torchaudio/datasets/cmuarctic.py index 96f498f00f..10b2151e43 100644 --- a/src/torchaudio/datasets/cmuarctic.py +++ b/src/torchaudio/datasets/cmuarctic.py @@ -4,6 +4,7 @@ from typing import Tuple, Union import torchaudio +from torchaudio.utils import load_torchcodec from torch import Tensor from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file @@ -43,7 +44,7 @@ def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str) file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio) # Load audio - waveform, sample_rate = torchaudio.load(file_audio) + waveform, sample_rate = load_torchcodec(file_audio) 
return (waveform, sample_rate, transcript, utterance_id.split("_")[1]) diff --git a/src/torchaudio/datasets/commonvoice.py b/src/torchaudio/datasets/commonvoice.py index db0e035c61..d926e22d03 100644 --- a/src/torchaudio/datasets/commonvoice.py +++ b/src/torchaudio/datasets/commonvoice.py @@ -6,6 +6,7 @@ import torchaudio from torch import Tensor from torch.utils.data import Dataset +from torchaudio.utils import load_torchcodec def load_commonvoice_item( @@ -20,7 +21,7 @@ def load_commonvoice_item( filename = os.path.join(path, folder_audio, fileid) if not filename.endswith(ext_audio): filename += ext_audio - waveform, sample_rate = torchaudio.load(filename) + waveform, sample_rate = load_torchcodec(filename) dic = dict(zip(header, line)) diff --git a/src/torchaudio/datasets/dr_vctk.py b/src/torchaudio/datasets/dr_vctk.py index a634b96894..dde5326a8e 100644 --- a/src/torchaudio/datasets/dr_vctk.py +++ b/src/torchaudio/datasets/dr_vctk.py @@ -6,6 +6,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_zip +from torchaudio.utils import load_torchcodec _URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip" @@ -75,8 +76,8 @@ def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, s source, channel_id = self._config[filename] file_clean_audio = self._clean_audio_dir / filename file_noisy_audio = self._noisy_audio_dir / filename - waveform_clean, sample_rate_clean = torchaudio.load(file_clean_audio) - waveform_noisy, sample_rate_noisy = torchaudio.load(file_noisy_audio) + waveform_clean, sample_rate_clean = load_torchcodec(file_clean_audio) + waveform_noisy, sample_rate_noisy = load_torchcodec(file_noisy_audio) return ( waveform_clean, sample_rate_clean, diff --git a/src/torchaudio/datasets/gtzan.py b/src/torchaudio/datasets/gtzan.py index 347e7e7183..2fc5e4d357 100644 --- a/src/torchaudio/datasets/gtzan.py +++ b/src/torchaudio/datasets/gtzan.py @@ -7,6 +7,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_tar +from torchaudio.utils import load_torchcodec # The following lists prefixed with `filtered_` provide a filtered split # that: @@ -990,7 +991,7 @@ def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, str # Read wav file_audio = os.path.join(path, label, fileid + ext_audio) - waveform, sample_rate = torchaudio.load(file_audio) + waveform, sample_rate = load_torchcodec(file_audio) return waveform, sample_rate, label diff --git a/src/torchaudio/datasets/librilight_limited.py b/src/torchaudio/datasets/librilight_limited.py index f0cb3100f7..01dcb99f1f 100644 --- a/src/torchaudio/datasets/librilight_limited.py +++ b/src/torchaudio/datasets/librilight_limited.py @@ -8,6 +8,7 @@ from torchaudio._internal import download_url_to_file from torchaudio.datasets.librispeech import _get_librispeech_metadata from torchaudio.datasets.utils import _extract_tar +from torchaudio.utils import load_torchcodec _ARCHIVE_NAME = "librispeech_finetuning" @@ -104,7 +105,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: """ file_path, fileid = self._fileids_paths[n] metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt) - waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0])) + waveform, _ = load_torchcodec(os.path.join(self._path, metadata[0])) return (waveform,) + metadata[1:] def 
__len__(self) -> int: diff --git a/src/torchaudio/datasets/libritts.py b/src/torchaudio/datasets/libritts.py index 829ce95729..95a878ce02 100644 --- a/src/torchaudio/datasets/libritts.py +++ b/src/torchaudio/datasets/libritts.py @@ -7,6 +7,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_tar +from torchaudio.utils import load_torchcodec URL = "train-clean-100" FOLDER_IN_ARCHIVE = "LibriTTS" @@ -41,7 +42,7 @@ def load_libritts_item( file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) # Load audio - waveform, sample_rate = torchaudio.load(file_audio) + waveform, sample_rate = load_torchcodec(file_audio) # Load original text with open(original_text) as ft: diff --git a/src/torchaudio/datasets/ljspeech.py b/src/torchaudio/datasets/ljspeech.py index 9cdaeeb0f3..d9a5554cfc 100644 --- a/src/torchaudio/datasets/ljspeech.py +++ b/src/torchaudio/datasets/ljspeech.py @@ -8,7 +8,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_tar - +from torchaudio.utils import load_torchcodec _RELEASE_CONFIGS = { "release1": { @@ -94,7 +94,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: fileid_audio = self._path / (fileid + ".wav") # Load audio - waveform, sample_rate = torchaudio.load(fileid_audio) + waveform, sample_rate = load_torchcodec(fileid_audio) return ( waveform, diff --git a/src/torchaudio/datasets/musdb_hq.py b/src/torchaudio/datasets/musdb_hq.py index dd4bc9f340..a74de61370 100644 --- a/src/torchaudio/datasets/musdb_hq.py +++ b/src/torchaudio/datasets/musdb_hq.py @@ -7,6 +7,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_zip +from torchaudio.utils import load_torchcodec _URL = "https://zenodo.org/record/3338373/files/musdb18hq.zip" _CHECKSUM = "baac80d0483c61d74b2e5f3be75fa557eec52898339e6aa45c1fa48833c5d21d" @@ -87,7 +88,7 @@ def _load_sample(self, n: int) -> Tuple[torch.Tensor, int, int, str]: num_frames = None for source in self.sources: track = self._get_track(name, source) - wav, sr = torchaudio.load(str(track)) + wav, sr = load_torchcodec(str(track)) if sr != _SAMPLE_RATE: raise ValueError(f"expected sample rate {_SAMPLE_RATE}, but got {sr}") if num_frames is None: diff --git a/src/torchaudio/datasets/tedlium.py b/src/torchaudio/datasets/tedlium.py index 7e7d22195a..3c7182100b 100644 --- a/src/torchaudio/datasets/tedlium.py +++ b/src/torchaudio/datasets/tedlium.py @@ -7,6 +7,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_tar +from torchaudio.utils import load_torchcodec _RELEASE_CONFIGS = { @@ -163,12 +164,7 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate Returns: [Tensor, int]: Audio tensor representation and sample rate """ - start_time = int(float(start_time) * sample_rate) - end_time = int(float(end_time) * sample_rate) - - kwargs = {"frame_offset": start_time, "num_frames": end_time - start_time} - - return torchaudio.load(path, **kwargs) + return load_torchcodec(path, start_seconds=float(start_time), stop_seconds=float(end_time)) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: """Load the n-th sample from the dataset. 
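
The TEDLIUM change above swaps the hand-rolled frame arithmetic for time-based decoding. As a minimal sketch of the equivalence between the two call styles — the file path, the 16 kHz rate, and the segment boundaries below are hypothetical; `torchaudio.load` with `frame_offset`/`num_frames` and `load_torchcodec` with `start_seconds`/`stop_seconds` are the APIs used in this diff:

    import torchaudio
    from torchaudio.utils import load_torchcodec

    path = "sample.wav"                 # hypothetical 16 kHz file
    sample_rate = 16000
    start_time, end_time = 1.25, 3.75   # segment boundaries in seconds

    # Previous style: convert seconds to frame offsets by hand.
    frame_offset = int(start_time * sample_rate)
    num_frames = int(end_time * sample_rate) - frame_offset
    old_segment, sr = torchaudio.load(path, frame_offset=frame_offset, num_frames=num_frames)

    # New helper: pass the boundaries in seconds directly and let torchcodec do the slicing.
    new_segment, sr = load_torchcodec(path, start_seconds=start_time, stop_seconds=end_time)

Both calls return a `(waveform, sample_rate)` pair with a channels-first waveform; the decoded segments may differ by a frame or so at the boundaries depending on how each backend rounds the cut points.
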
diff --git a/src/torchaudio/datasets/utils.py b/src/torchaudio/datasets/utils.py index b4599f83aa..2952510eab 100644 --- a/src/torchaudio/datasets/utils.py +++ b/src/torchaudio/datasets/utils.py @@ -3,6 +3,7 @@ import tarfile import zipfile from typing import Any, List, Optional +from torchaudio.utils import load_torchcodec import torchaudio @@ -48,7 +49,7 @@ def _load_waveform( exp_sample_rate: int, ): path = os.path.join(root, filename) - waveform, sample_rate = torchaudio.load(path) + waveform, sample_rate = load_torchcodec(path) if exp_sample_rate != sample_rate: raise ValueError(f"sample rate should be {exp_sample_rate}, but got {sample_rate}") return waveform diff --git a/src/torchaudio/datasets/vctk.py b/src/torchaudio/datasets/vctk.py index 3195b9b427..4879c5274e 100644 --- a/src/torchaudio/datasets/vctk.py +++ b/src/torchaudio/datasets/vctk.py @@ -6,6 +6,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_zip +from torchaudio.utils import load_torchcodec URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip" _CHECKSUMS = { @@ -98,7 +99,7 @@ def _load_text(self, file_path) -> str: return file_path.readlines()[0] def _load_audio(self, file_path) -> Tuple[Tensor, int]: - return torchaudio.load(file_path) + return load_torchcodec(file_path) def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType: transcript_path = os.path.join(self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt") diff --git a/src/torchaudio/datasets/yesno.py b/src/torchaudio/datasets/yesno.py index baad08f159..ba42775be8 100644 --- a/src/torchaudio/datasets/yesno.py +++ b/src/torchaudio/datasets/yesno.py @@ -7,7 +7,7 @@ from torch.utils.data import Dataset from torchaudio._internal import download_url_to_file from torchaudio.datasets.utils import _extract_tar - +from torchaudio.utils import load_torchcodec _RELEASE_CONFIGS = { "release1": { @@ -62,7 +62,7 @@ def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, downloa def _load_item(self, fileid: str, path: str): labels = [int(c) for c in fileid.split("_")] file_audio = os.path.join(path, fileid + ".wav") - waveform, sample_rate = torchaudio.load(file_audio) + waveform, sample_rate = load_torchcodec(file_audio) return waveform, sample_rate, labels def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]: diff --git a/src/torchaudio/models/wav2vec2/utils/import_fairseq.py b/src/torchaudio/models/wav2vec2/utils/import_fairseq.py index 39791e9b7d..d255730e53 100644 --- a/src/torchaudio/models/wav2vec2/utils/import_fairseq.py +++ b/src/torchaudio/models/wav2vec2/utils/import_fairseq.py @@ -140,7 +140,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model: Example - Loading pretrain-only model >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model - >>> + >>> from torchaudio.utils import load_torchcodec >>> # Load model using fairseq >>> model_file = 'wav2vec_small.pt' >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file]) @@ -148,7 +148,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model: >>> imported = import_fairseq_model(original) >>> >>> # Perform feature extraction - >>> waveform, _ = torchaudio.load('audio.wav') + >>> waveform, _ = load_torchcodec('audio.wav') >>> features, _ = imported.extract_features(waveform) >>> >>> # Compare result with the original model from fairseq @@ -157,7 +157,7 @@ def 
import_fairseq_model(original: Module) -> Wav2Vec2Model: Example - Fine-tuned model >>> from torchaudio.models.wav2vec2.utils import import_fairseq_model - >>> + >>> from torchaudio.utils import load_torchcodec >>> # Load model using fairseq >>> model_file = 'wav2vec_small_960h.pt' >>> model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_file]) @@ -165,7 +165,7 @@ def import_fairseq_model(original: Module) -> Wav2Vec2Model: >>> imported = import_fairseq_model(original.w2v_encoder) >>> >>> # Perform encoding - >>> waveform, _ = torchaudio.load('audio.wav') + >>> waveform, _ = load_torchcodec('audio.wav') >>> emission, _ = imported(waveform) >>> >>> # Compare result with the original model from fairseq diff --git a/src/torchaudio/models/wav2vec2/utils/import_huggingface.py b/src/torchaudio/models/wav2vec2/utils/import_huggingface.py index 519d8c919f..7187536d25 100644 --- a/src/torchaudio/models/wav2vec2/utils/import_huggingface.py +++ b/src/torchaudio/models/wav2vec2/utils/import_huggingface.py @@ -117,8 +117,8 @@ def import_huggingface_model(original: Module) -> Wav2Vec2Model: >>> >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") >>> model = import_huggingface_model(original) - >>> - >>> waveforms, _ = torchaudio.load("audio.wav") + >>> from torchaudio.utils import load_torchcodec + >>> waveforms, _ = load_torchcodec("audio.wav") >>> logits, _ = model(waveforms) """ _LG.info("Importing model.") diff --git a/src/torchaudio/models/wavernn.py b/src/torchaudio/models/wavernn.py index 8ae5a3e916..c2367ed96b 100644 --- a/src/torchaudio/models/wavernn.py +++ b/src/torchaudio/models/wavernn.py @@ -222,7 +222,8 @@ class WaveRNN(nn.Module): Example >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200) - >>> waveform, sample_rate = torchaudio.load(file) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec(file) >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) >>> output = wavernn(waveform, specgram) diff --git a/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py b/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py index 0ae812f920..b23db4c9fc 100644 --- a/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +++ b/src/torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py @@ -22,12 +22,12 @@ class VGGishBundle: Example: >>> import torchaudio >>> from torchaudio.prototype.pipelines import VGGISH - >>> + >>> from torchaudio.utils import load_torchcodec >>> input_sr = VGGISH.sample_rate >>> input_proc = VGGISH.get_input_processor() >>> model = VGGISH.get_model() >>> - >>> waveform, sr = torchaudio.load( + >>> waveform, sr = load_torchcodec( >>> "Chopin_Ballade_-1_In_G_Minor,_Op._23.mp3", >>> ) >>> waveform = waveform.squeeze(0) diff --git a/src/torchaudio/prototype/transforms/_transforms.py b/src/torchaudio/prototype/transforms/_transforms.py index 3390b3a583..88930c38b3 100644 --- a/src/torchaudio/prototype/transforms/_transforms.py +++ b/src/torchaudio/prototype/transforms/_transforms.py @@ -24,7 +24,8 @@ class BarkScale(torch.nn.Module): bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. 
(Default: ``traunmuller``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) >>> spectrogram = spectrogram_transform(waveform) >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1) @@ -95,7 +96,8 @@ class InverseBarkScale(torch.nn.Module): bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> mel_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024) >>> mel_spectrogram = bark_spectrogram_transform(waveform) >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1) @@ -230,7 +232,8 @@ class BarkSpectrogram(torch.nn.Module): bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.BarkSpectrogram(sample_rate) >>> bark_specgram = transform(waveform) # (channel, n_barks, time) @@ -320,7 +323,8 @@ class ChromaScale(torch.nn.Module): base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) >>> spectrogram = spectrogram_transform(waveform) >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1) @@ -397,7 +401,8 @@ class ChromaSpectrogram(torch.nn.Module): base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. 
(Default: True) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400) >>> chromagram = transform(waveform) # (channel, n_chroma, time) """ diff --git a/src/torchaudio/sox_effects/sox_effects.py b/src/torchaudio/sox_effects/sox_effects.py index 256c461edc..b50925c2c2 100644 --- a/src/torchaudio/sox_effects/sox_effects.py +++ b/src/torchaudio/sox_effects/sox_effects.py @@ -151,7 +151,8 @@ def apply_effects_tensor( >>> transform = torch.jit.load(path) >>> >>>> # Run transform - >>> waveform, input_sample_rate = torchaudio.load("input.wav") + >>> from torchaudio.utils import load_torchcodec + >>> waveform, input_sample_rate = load_torchcodec("input.wav") >>> waveform, sample_rate = transform(waveform, input_sample_rate) >>> assert sample_rate == 8000 """ diff --git a/src/torchaudio/transforms/_transforms.py b/src/torchaudio/transforms/_transforms.py index 5bf914bc12..deeb7e0928 100644 --- a/src/torchaudio/transforms/_transforms.py +++ b/src/torchaudio/transforms/_transforms.py @@ -54,7 +54,8 @@ class Spectrogram(torch.nn.Module): Deprecated and not used. Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = torchaudio.transforms.Spectrogram(n_fft=800) >>> spectrogram = transform(waveform) @@ -315,7 +316,8 @@ class AmplitudeToDB(torch.nn.Module): number is 80. (Default: ``None``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.AmplitudeToDB(stype="amplitude", top_db=80) >>> waveform_db = transform(waveform) """ @@ -364,7 +366,8 @@ class MelScale(torch.nn.Module): mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) >>> spectrogram = spectrogram_transform(waveform) >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1) @@ -438,7 +441,8 @@ class InverseMelScale(torch.nn.Module): (Default: ``"gels``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> mel_spectrogram_transform = transforms.MelSpectrogram(sample_rate, n_fft=1024) >>> mel_spectrogram = mel_spectrogram_transform(waveform) >>> inverse_melscale_transform = transforms.InverseMelScale(n_stft=1024 // 2 + 1) @@ -544,7 +548,8 @@ class MelSpectrogram(torch.nn.Module): mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. 
(Default: ``htk``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.MelSpectrogram(sample_rate) >>> mel_specgram = transform(waveform) # (channel, n_mels, time) @@ -646,7 +651,8 @@ class MFCC(torch.nn.Module): melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.MFCC( >>> sample_rate=sample_rate, >>> n_mfcc=13, @@ -736,7 +742,8 @@ class LFCC(torch.nn.Module): speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.LFCC( >>> sample_rate=sample_rate, >>> n_lfcc=13, @@ -836,7 +843,8 @@ class MuLawEncoding(torch.nn.Module): quantization_channels (int, optional): Number of channels. (Default: ``256``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512) >>> mulawtrans = transform(waveform) @@ -875,7 +883,8 @@ class MuLawDecoding(torch.nn.Module): quantization_channels (int, optional): Number of channels. (Default: ``256``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512) >>> mulawtrans = transform(waveform) """ @@ -928,7 +937,8 @@ class Resample(torch.nn.Module): carried out on ``torch.float64``. Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.Resample(sample_rate, sample_rate/10) >>> waveform = transform(waveform) """ @@ -1098,7 +1108,8 @@ class Fade(torch.nn.Module): (Default: ``"linear"``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape="linear") >>> faded_waveform = transform(waveform) """ @@ -1359,7 +1370,9 @@ class Loudness(torch.nn.Module): sample_rate (int): Sample rate of audio signal. Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.Loudness(sample_rate) >>> loudness = transform(waveform) @@ -1398,7 +1411,9 @@ class Vol(torch.nn.Module): gain_type (str, optional): Type of gain. 
One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.Vol(gain=0.5, gain_type="amplitude") >>> quieter_waveform = transform(waveform) """ @@ -1448,7 +1463,9 @@ class SlidingWindowCmn(torch.nn.Module): norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.SlidingWindowCmn(cmn_window=1000) >>> cmn_waveform = transform(waveform) """ @@ -1528,7 +1545,9 @@ class Vad(torch.nn.Module): in the detector algorithm. (Default: 2000.0) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> waveform_reversed, sample_rate = apply_effects_tensor(waveform, sample_rate, [["reverse"]]) >>> transform = transforms.Vad(sample_rate=sample_rate, trigger_level=7.5) >>> waveform_reversed_front_trim = transform(waveform_reversed) @@ -1631,7 +1650,9 @@ class SpectralCentroid(torch.nn.Module): wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``) Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.SpectralCentroid(sample_rate) >>> spectral_centroid = transform(waveform) # (channel, time) """ @@ -1690,7 +1711,9 @@ class PitchShift(LazyModuleMixin, torch.nn.Module): If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``). Example - >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> from torchaudio.utils import load_torchcodec + >>> + >>> waveform, sample_rate = load_torchcodec("test.wav", normalize=True) >>> transform = transforms.PitchShift(sample_rate, 4) >>> waveform_shift = transform(waveform) # (channel, time) """ diff --git a/src/torchaudio/utils/__init__.py b/src/torchaudio/utils/__init__.py index 89bffaa34d..b4c76baf6b 100644 --- a/src/torchaudio/utils/__init__.py +++ b/src/torchaudio/utils/__init__.py @@ -2,9 +2,22 @@ from . import sox_utils from .download import download_asset +import os +def load_torchcodec(file, normalize=True, channels_first=True, start_seconds=0.0, stop_seconds=None, **args): + if not normalize: + raise Exception("Torchcodec does not support non-normalized file reading") + try: + from torchcodec.decoders import AudioDecoder + except: + raise Exception("To use this feature, you must install torchcodec. 
See https://github.com/pytorch/torchcodec for installation instructions") + decoder = AudioDecoder(file, **args) + samples = decoder.get_samples_played_in_range(start_seconds, stop_seconds) + data = samples.data if channels_first else samples.data.T + return (data, samples.sample_rate) __all__ = [ + "load_torchcodec", "download_asset", "sox_utils", "ffmpeg_utils", diff --git a/src/torchaudio/utils/ffmpeg_utils.py b/src/torchaudio/utils/ffmpeg_utils.py index 385596edc1..04358a0494 100644 --- a/src/torchaudio/utils/ffmpeg_utils.py +++ b/src/torchaudio/utils/ffmpeg_utils.py @@ -1,6 +1,6 @@ """Module to change the configuration of FFmpeg libraries (such as libavformat). -It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.load`). +It affects functionalities in :py:mod:`torchaudio.io` (and indirectly :py:func:`torchaudio.utils.load_torchcodec`). """ diff --git a/test/integration_tests/loudness_compliance_test.py b/test/integration_tests/loudness_compliance_test.py index d9473cfa50..3c28affb54 100644 --- a/test/integration_tests/loudness_compliance_test.py +++ b/test/integration_tests/loudness_compliance_test.py @@ -5,6 +5,7 @@ import torch import torchaudio +from torchaudio.utils import load_torchcodec import torchaudio.functional as F @@ -40,7 +41,7 @@ def test_loudness(tmp_path, filename, url, expected): with zipfile.ZipFile(zippath) as file: file.extractall(zippath.parent) - waveform, sample_rate = torchaudio.load(zippath.with_suffix(".wav")) + waveform, sample_rate = load_torchcodec(zippath.with_suffix(".wav")) loudness = F.loudness(waveform, sample_rate) expected = torch.tensor(expected, dtype=loudness.dtype, device=loudness.device) assert torch.allclose(loudness, expected, rtol=0.01, atol=0.1) diff --git a/test/integration_tests/prototype/vggish_pipeline_test.py b/test/integration_tests/prototype/vggish_pipeline_test.py index 72c6e1e518..25a27b7e10 100644 --- a/test/integration_tests/prototype/vggish_pipeline_test.py +++ b/test/integration_tests/prototype/vggish_pipeline_test.py @@ -1,4 +1,5 @@ import torchaudio +from torchaudio.utils import load_torchcodec from torchaudio.prototype.pipelines import VGGISH @@ -7,7 +8,7 @@ def test_vggish(): input_proc = VGGISH.get_input_processor() model = VGGISH.get_model() path = torchaudio.utils.download_asset("test-assets/Chopin_Ballade_-1_In_G_Minor,_Op._23_excerpt.mp3") - waveform, sr = torchaudio.load(path, backend="ffmpeg") + waveform, sr = load_torchcodec(path) waveform = waveform.mean(axis=0) waveform = torchaudio.functional.resample(waveform, sr, input_sr) batch = input_proc(waveform) diff --git a/test/integration_tests/rnnt_pipeline_test.py b/test/integration_tests/rnnt_pipeline_test.py index 6827d27d46..fbcce60f6d 100644 --- a/test/integration_tests/rnnt_pipeline_test.py +++ b/test/integration_tests/rnnt_pipeline_test.py @@ -1,5 +1,6 @@ import pytest import torchaudio +from torchaudio.utils import load_torchcodec from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3 @@ -16,7 +17,7 @@ def test_rnnt(bundle, sample_speech, expected): feature_extractor = bundle.get_feature_extractor() decoder = bundle.get_decoder().eval() token_processor = bundle.get_token_processor() - waveform, _ = torchaudio.load(sample_speech) + waveform, _ = load_torchcodec(sample_speech) features, length = feature_extractor(waveform.squeeze()) hypotheses = decoder(features, length, 10) text = token_processor(hypotheses[0][0])
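For reference, a minimal usage sketch of the load_torchcodec helper added above ("speech.wav" is a placeholder path; the helper requires torchcodec to be installed and rejects normalize=False):

from torchaudio.utils import load_torchcodec

# Decode a whole file; with channels_first=True (the default) the tensor is (channels, samples).
waveform, sample_rate = load_torchcodec("speech.wav")

# Decode only a window: offsets are given in seconds and forwarded to
# AudioDecoder.get_samples_played_in_range(start_seconds, stop_seconds).
clip, sample_rate = load_torchcodec("speech.wav", start_seconds=1.0, stop_seconds=4.0)

# Time-major layout instead of channel-major.
clip_t, _ = load_torchcodec("speech.wav", channels_first=False)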
diff --git a/test/integration_tests/source_separation_pipeline_test.py b/test/integration_tests/source_separation_pipeline_test.py index 7507958400..c56683dcc0 100644 --- a/test/integration_tests/source_separation_pipeline_test.py +++ b/test/integration_tests/source_separation_pipeline_test.py @@ -4,6 +4,7 @@ import pytest import torch import torchaudio +from torchaudio.utils import load_torchcodec from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX, HDEMUCS_HIGH_MUSDB, HDEMUCS_HIGH_MUSDB_PLUS @@ -27,11 +28,11 @@ def test_source_separation_models(bundle, task, channel, expected_score, mixture Si-SDR score should be equal to or larger than the expected score. """ model = bundle.get_model() - mixture_waveform, sample_rate = torchaudio.load(mixture_source) + mixture_waveform, sample_rate = load_torchcodec(mixture_source) assert sample_rate == bundle.sample_rate, "The sample rate of audio must match that in the bundle." clean_waveforms = [] for source in clean_sources: - clean_waveform, sample_rate = torchaudio.load(source) + clean_waveform, sample_rate = load_torchcodec(source) assert sample_rate == bundle.sample_rate, "The sample rate of audio must match that in the bundle." clean_waveforms.append(clean_waveform) mixture_waveform = mixture_waveform.reshape(1, channel, -1) diff --git a/test/integration_tests/squim_pipeline_test.py b/test/integration_tests/squim_pipeline_test.py index 9f78bba4d4..c8b21a14d5 100644 --- a/test/integration_tests/squim_pipeline_test.py +++ b/test/integration_tests/squim_pipeline_test.py @@ -1,5 +1,6 @@ import pytest import torchaudio +from torchaudio.utils import load_torchcodec from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE @@ -16,7 +17,7 @@ def test_squim_objective_pretrained_weights(lang, expected, sample_speech): # Get SquimObjective model model = bundle.get_model() # Create a synthetic waveform - waveform, sample_rate = torchaudio.load(sample_speech) + waveform, sample_rate = load_torchcodec(sample_speech) scores = model(waveform) for i in range(3): assert abs(scores[i].item() - expected[i]) < 1e-5 @@ -35,9 +36,9 @@ def test_squim_subjective_pretrained_weights(task, expected, mixture_source, cle # Get SquimObjective model model = bundle.get_model() # Load input mixture audio - waveform, sample_rate = torchaudio.load(mixture_source) + waveform, sample_rate = load_torchcodec(mixture_source) for i, source in enumerate(clean_sources): # Load clean reference - clean_waveform, sample_rate = torchaudio.load(source) + clean_waveform, sample_rate = load_torchcodec(source) score = model(waveform, clean_waveform) assert abs(score.item() - expected[i]) < 1e-5 diff --git a/test/integration_tests/wav2vec2_pipeline_test.py b/test/integration_tests/wav2vec2_pipeline_test.py index c863ea3688..a6489169b1 100644 --- a/test/integration_tests/wav2vec2_pipeline_test.py +++ b/test/integration_tests/wav2vec2_pipeline_test.py @@ -2,6 +2,7 @@ import pytest import torchaudio +from torchaudio.utils import load_torchcodec from torchaudio.pipelines import ( HUBERT_ASR_LARGE, HUBERT_ASR_XLARGE, @@ -113,7 +114,7 @@ def test_finetune_asr_model( ): """Smoke test of downloading weights for fine-tuning models and simple transcription""" model = bundle.get_model().eval() - waveform, sample_rate = torchaudio.load(sample_speech) + waveform, sample_rate = load_torchcodec(sample_speech) emission, _ = model(waveform) decoder = ctc_decoder(bundle.get_labels()) result = decoder(emission[0]) diff --git a/test/torchaudio_unittest/conftest.py 
b/test/torchaudio_unittest/conftest.py new file mode 100644 index 0000000000..0a20827ade --- /dev/null +++ b/test/torchaudio_unittest/conftest.py @@ -0,0 +1,14 @@ +import pytest +import os + + +def pytest_collection_modifyitems(config, items): + fail_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "ffmpeg_fail_ids.txt") + with open(fail_path, 'r') as file: + fail_ids = set([f.strip() for f in file.readlines()]) + + skip_marker = pytest.mark.skip(reason="FFMPEG incompatible with CI runner") + + for item in items: + if item.nodeid in fail_ids: + item.add_marker(skip_marker) diff --git a/test/torchaudio_unittest/ffmpeg_fail_ids.txt b/test/torchaudio_unittest/ffmpeg_fail_ids.txt new file mode 100644 index 0000000000..50bd062384 --- /dev/null +++ b/test/torchaudio_unittest/ffmpeg_fail_ids.txt @@ -0,0 +1,228 @@ +test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_path +test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_str +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_path +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_str +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceFR::test_commonvoice_str +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_path +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_str +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_path +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_str +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTest +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTrain +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsValid +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_no_subset +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_path +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_str +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_path +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_str +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_path +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_str +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPFullDataset +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPImprovisedDataset +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPScriptedDataset +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10h +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10min +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_1h +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_0_sep_clean +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_1_enh_single +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_2_enh_both +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_3_sep_noisy 
+test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_0_sep_clean +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_1_enh_single +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_2_enh_both +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_3_sep_noisy +test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_path +test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_str +test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_path +test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_str +test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_path +test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_str +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_4 
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_6 +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDev +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDocs +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14Subsetval +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetEval +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14EvalSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceEN::test_commonvoice_path +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceEN::test_commonvoice_str +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPFullDataset +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPImprovisedDataset 
+test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIEMOCAPScriptedDataset +test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTest +test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTrain +test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsValid +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTest +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTrain +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetValid +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_path +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_str +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_path +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_str +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release2 +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release3 +test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_path +test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_str +test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Identification::testVoxCeleb1SubsetTrain +test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Verification::testVoxCeleb1Verification +test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_path +test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_str +test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix2::test_wsj0mix +test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix3::test_wsj0mix +test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_path +test/torchaudio_unittest/datasets/cmuarctic_test.py::TestCMUARCTIC::test_cmuarctic_str +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_path +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceN::test_commonvoice_str +test/torchaudio_unittest/datasets/commonvoice_test.py::TestCommonVoiceFR::test_commonvoice_str +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_path +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_test_str +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_path +test/torchaudio_unittest/datasets/dr_vctk_test.py::TestDRVCTK::test_dr_vctk_train_str +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTest +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsTrain +test/torchaudio_unittest/datasets/fluentcommands_test.py::TestFluentSpeechCommands::testFluentCommandsValid +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_no_subset +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_path +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_testing_str +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_path +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_training_str +test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_path 
+test/torchaudio_unittest/datasets/gtzan_test.py::TestGTZAN::test_validation_str +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPFullDataset +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPImprovisedDataset +test/torchaudio_unittest/datasets/iemocap_test.py::TestIemocap::testIMOCAPScriptedDataset +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10h +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_10min +test/torchaudio_unittest/datasets/librilightlimited_test.py::TestLibriLightLimited::test_librilightlimited_1h +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_0_sep_clean +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_1_enh_single +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_2_enh_both +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_2speaker_3_sep_noisy +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_0_sep_clean +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_1_enh_single +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_2_enh_both +test/torchaudio_unittest/datasets/librimix_test.py::TestLibriMix::test_librimix_3speaker_3_sep_noisy +test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_path +test/torchaudio_unittest/datasets/librispeech_test.py::TestLibriSpeech::test_librispeech_str +test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_path +test/torchaudio_unittest/datasets/libritts_test.py::TestLibriTTS::test_libritts_str +test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_path +test/torchaudio_unittest/datasets/ljspeech_test.py::TestLJSpeech::test_ljspeech_str +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_test_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_all_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_0 
+test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_train_with_validation_6 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_0 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_1 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_2 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_3 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_4 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_5 +test/torchaudio_unittest/datasets/musdb_hq_test.py::TestMusDB_HQ::testMusDBSources_validation_6 +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DevSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14DocsSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_0_albanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_1_basque +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_2_czech +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_3_nnenglish +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_4_romanian +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14valSingleLanguage_5_slovak +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDev +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14SubsetDocs +test/torchaudio_unittest/datasets/quesst14_test.py::TestQuesst14::testQuesst14Subsetval 
+test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTest +test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsTrain +test/torchaudio_unittest/datasets/snips_test.py::TestSnips::testSnipsValid +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTest +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetTrain +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommandsSubsetValid +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_path +test/torchaudio_unittest/datasets/speechcommands_test.py::TestSpeechCommands::testSpeechCommands_str +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_path +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release1_str +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release2 +test/torchaudio_unittest/datasets/tedlium_test.py::Tedlium::test_tedlium_release3 +test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_path +test/torchaudio_unittest/datasets/vctk_test.py::TestVCTK::test_vctk_str +test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Identification::testVoxCeleb1SubsetTrain +test/torchaudio_unittest/datasets/voxceleb1_test.py::TestVoxCeleb1Verification::testVoxCeleb1Verification +test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_path +test/torchaudio_unittest/datasets/yesno_test.py::TestYesNo::test_yesno_str +test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix2::test_wsj0mix +test/torchaudio_unittest/example/souce_sepration/wsj0mix_test.py::TestWSJ0Mix3::test_wsj0mix
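Because the new conftest hook keys skips on exact pytest node IDs, entries in ffmpeg_fail_ids.txt can silently go stale when tests are renamed or removed. A rough local check is sketched below; it assumes pytest is invoked from the repository root so that collected node IDs carry the same test/torchaudio_unittest/... prefix as the file.

import subprocess

# Node IDs listed as known FFmpeg-related failures (duplicate lines collapse into the set).
with open("test/torchaudio_unittest/ffmpeg_fail_ids.txt") as f:
    fail_ids = {line.strip() for line in f if line.strip()}

# Ask pytest for every node ID it would collect under test/torchaudio_unittest.
collected = subprocess.run(
    ["python", "-m", "pytest", "test/torchaudio_unittest", "--collect-only", "-q"],
    capture_output=True,
    text=True,
).stdout.splitlines()

# Entries that no longer correspond to a collected test.
stale = sorted(fail_ids - set(collected))
print(f"{len(stale)} entries in ffmpeg_fail_ids.txt no longer match a collected test")
for node_id in stale:
    print(node_id)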