pytorch · samanklesaria · Jul 8, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 10, 2025
@@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in
     ;;
 esac
 PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}"
-pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}"
+pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}"
 
 
 # 2. Install torchaudio
@@ -85,6 +85,9 @@ export BUILD_CPP_TEST=1
 python setup.py install
 
 # 3. Install Test tools
+conda install -y "ffmpeg<5"
+python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)"
+
 printf "* Installing test tools\n"
 NUMBA_DEV_CHANNEL=""
 if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
@@ -94,7 +97,7 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then
 fi
 (
     set -x
-    conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' 'ffmpeg>=6,<7'
+    conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} sox libvorbis parameterized 'requests>=2.20' 
     pip install kaldi-io SoundFile librosa coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm
 
     # TODO: might be better to fix the single call to `pip install` above

@@ -68,7 +68,7 @@ jobs:
 
         GPU_ARCH_ID=cu126  # This is hard-coded and must be consistent with gpu-arch-version.
         PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}"
-        pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}"
+        pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}"
 
         echo "::endgroup::"
         echo "::group::Install TorchAudio"

@@ -1,6 +1,7 @@
 Jinja2<3.1.0
 matplotlib<=3.8
 pyparsing<3,>=2.0.2
+torchcodec
 
 # C++ docs
 breathe==4.34.0

@@ -182,7 +182,7 @@ Tutorials
 
 .. customcarditem::
    :header: Loading waveform Tensors from files and saving them
-   :card_description: Learn how to query/load audio files and save waveform tensors to files, using <code>torchaudio.info</code>, <code>torchaudio.load</code> and <code>torchaudio.save</code> functions.
+   :card_description: Learn how to query/load audio files and save waveform tensors to files, using <code>torchaudio.info</code>, <code>torchaudio.utils.load_torchcodec</code> and <code>torchaudio.save</code> functions.
    :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/audio_io_tutorial.png
    :link: tutorials/audio_io_tutorial.html
    :tags: I/O
@@ -399,7 +399,7 @@ In BibTeX format:
 .. code-block:: bibtex
 
    @misc{hwang2023torchaudio,
-      title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch}, 
+      title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
       author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
       year={2023},
       eprint={2310.17864},

@@ -4,6 +4,7 @@
 import torch
 import torchaudio
 import yaml
+from torchaudio.utils import load_torchcodec
 
 
 FOLDER_IN_ARCHIVE = "en-de"
@@ -31,15 +32,15 @@ def __init__(
         self.idx_target_lengths = []
         self.wav_list = []
         for idx, item in enumerate(file_list):
-            offset = int(item["offset"] * SAMPLE_RATE)
-            duration = int(item["duration"] * SAMPLE_RATE)
+            offset = item["offset"]
+            duration = item["duration"]
             self.idx_target_lengths.append((idx, item["duration"]))
             file_path = wav_dir / item["wav"]
             self.wav_list.append((file_path, offset, duration))
 
     def _get_mustc_item(self, idx):
         file_path, offset, duration = self.wav_list[idx]
-        waveform, sr = torchaudio.load(file_path, frame_offset=offset, num_frames=duration)
+        waveform, sr = load_torchcodec(file_path, start_seconds=offset, stop_seconds=offset + duration)
         assert sr == SAMPLE_RATE
         transcript = self.trans_list[idx].replace("\n", "")
         return (waveform, transcript)

diff --git a/examples/avsr/data_prep/data/data_module.py b/examples/avsr/data_prep/data/data_module.py
@@ -7,7 +7,7 @@
 import torch
 import torchaudio
 import torchvision
-
+from torchaudio.utils import load_torchcodec
 
 class AVSRDataLoader:
     def __init__(self, modality, detector="retinaface", resize=None):
@@ -39,7 +39,7 @@ def load_data(self, data_filename, transform=True):
             return video
 
     def load_audio(self, data_filename):
-        waveform, sample_rate = torchaudio.load(data_filename, normalize=True)
+        waveform, sample_rate = load_torchcodec(data_filename, normalize=True)
         return waveform, sample_rate
 
     def load_video(self, data_filename):

diff --git a/examples/avsr/lrs3.py b/examples/avsr/lrs3.py
@@ -3,6 +3,7 @@
 import torchaudio
 import torchvision
 from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec
 
 
 def _load_list(args, *filenames):
@@ -31,7 +32,7 @@ def load_audio(path):
     """
     rtype: torch, T x 1
     """
-    waveform, sample_rate = torchaudio.load(path, normalize=True)
+    waveform, sample_rate = load_torchcodec(path, normalize=True)
     return waveform.transpose(1, 0)
 
 

@@ -8,6 +8,7 @@
 from torch import Tensor
 from torch.utils.data import Dataset
 from utils import CollateFnL3DAS22
+from torchaudio.utils import load_torchcodec
 
 _PREFIX = "L3DAS22_Task1_"
 _SUBSETS = {
@@ -46,10 +47,10 @@ def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, str]:
         noisy_path_B = str(noisy_path_A).replace("_A.wav", "_B.wav")
         clean_path = noisy_path_A.parent.parent / "labels" / noisy_path_A.name.replace("_A.wav", ".wav")
         transcript_path = str(clean_path).replace("wav", "txt")
-        waveform_noisy_A, sample_rate1 = torchaudio.load(noisy_path_A)
-        waveform_noisy_B, sample_rate2 = torchaudio.load(noisy_path_B)
+        waveform_noisy_A, sample_rate1 = load_torchcodec(noisy_path_A)
+        waveform_noisy_B, sample_rate2 = load_torchcodec(noisy_path_B)
         waveform_noisy = torch.cat((waveform_noisy_A, waveform_noisy_B), dim=0)
-        waveform_clean, sample_rate3 = torchaudio.load(clean_path)
+        waveform_clean, sample_rate3 = load_torchcodec(clean_path)
         assert sample_rate1 == _SAMPLE_RATE and sample_rate2 == _SAMPLE_RATE and sample_rate3 == _SAMPLE_RATE
         with open(transcript_path, "r") as f:
             transcript = f.readline()

@@ -12,6 +12,9 @@
 from torch import Tensor
 from torch.utils.data import BatchSampler, Dataset, DistributedSampler
 
+from torchaudio.utils import load_torchcodec
+
+
 sys.path.append("..")
 from utils import _get_label2id
 
@@ -299,7 +302,7 @@ def _load_audio(self, index: int) -> Tensor:
             (Tensor): The corresponding waveform Tensor.
         """
         wav_path = self.f_list[index]
-        waveform, sample_rate = torchaudio.load(wav_path)
+        waveform, sample_rate = load_torchcodec(wav_path)
         assert waveform.shape[1] == self.len_list[index]
         return waveform
 

@@ -13,6 +13,7 @@
 from torch.nn import Module
 
 from .common_utils import _get_feat_lens_paths
+from torchaudio.utils import load_torchcodec
 
 _LG = logging.getLogger(__name__)
 _DEFAULT_DEVICE = torch.device("cpu")
@@ -53,7 +54,7 @@ def extract_feature_mfcc(
     Returns:
         Tensor: The desired feature tensor of the given audio file.
     """
-    waveform, sr = torchaudio.load(path)
+    waveform, sr = load_torchcodec(path)
     assert sr == sample_rate
     feature_extractor = torchaudio.transforms.MFCC(
         sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 160, "center": False}
@@ -88,7 +89,7 @@ def extract_feature_hubert(
     Returns:
         Tensor: The desired feature tensor of the given audio file.
     """
-    waveform, sr = torchaudio.load(path)
+    waveform, sr = load_torchcodec(path)
     assert sr == sample_rate
     waveform = waveform.to(device)
     with torch.inference_mode():

@@ -7,7 +7,7 @@
 
 import torch
 import torchaudio
-
+from torchaudio.utils import load_torchcodec
 
 class Pipeline(torch.nn.Module):
     """Example audio process pipeline.
@@ -17,15 +17,15 @@ class Pipeline(torch.nn.Module):
 
     def __init__(self, rir_path: str):
         super().__init__()
-        rir, sample_rate = torchaudio.load(rir_path)
+        rir, sample_rate = load_torchcodec(rir_path)
         self.register_buffer("rir", rir)
         self.rir_sample_rate: int = sample_rate
 
     def forward(self, input_path: str, output_path: str):
         torchaudio.sox_effects.init_sox_effects()
 
         # 1. load audio
-        waveform, sample_rate = torchaudio.load(input_path)
+        waveform, sample_rate = load_torchcodec(input_path)
 
         # 2. Add background noise
         alpha = 0.01

@@ -14,6 +14,7 @@
 from greedy_decoder import Decoder
 from torch.utils.mobile_optimizer import optimize_for_mobile
 from torchaudio.models.wav2vec2.utils.import_fairseq import import_fairseq_model
+from torchaudio.utils import load_torchcodec
 
 TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
 if TORCH_VERSION >= (1, 10):
@@ -58,7 +59,7 @@ def _parse_args():
 
 class Loader(torch.nn.Module):
     def forward(self, audio_path: str) -> torch.Tensor:
-        waveform, sample_rate = torchaudio.load(audio_path)
+        waveform, sample_rate = load_torchcodec(audio_path)
         if sample_rate != 16000:
             waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
         return waveform

@@ -8,6 +8,7 @@
 import torchaudio
 from greedy_decoder import Decoder
 from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model
+from torchaudio.utils import load_torchcodec
 
 TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
 if TORCH_VERSION >= (1, 10):
@@ -49,7 +50,7 @@ def _parse_args():
 
 class Loader(torch.nn.Module):
     def forward(self, audio_path: str) -> torch.Tensor:
-        waveform, sample_rate = torchaudio.load(audio_path)
+        waveform, sample_rate = load_torchcodec(audio_path)
         if sample_rate != 16000:
             waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
         return waveform

@@ -8,6 +8,7 @@
 import torchaudio
 from torch import Tensor
 from torch.utils.data import BatchSampler, Dataset, DistributedSampler
+from torchaudio.utils import load_torchcodec
 
 from ..lightning_modules import Batch
 
@@ -295,7 +296,7 @@ def _load_audio(self, index: int) -> Tensor:
             (Tensor): The corresponding waveform Tensor.
         """
         wav_path = self.f_list[index]
-        waveform, sample_rate = torchaudio.load(wav_path)
+        waveform, sample_rate = load_torchcodec(wav_path)
         assert waveform.shape[1] == self.len_list[index]
         return waveform
 

@@ -4,6 +4,7 @@
 import torch
 import torchaudio
 from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec
 
 SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]]
 
@@ -37,7 +38,7 @@ def __init__(
         self.files.sort()
 
     def _load_audio(self, path) -> torch.Tensor:
-        waveform, sample_rate = torchaudio.load(path)
+        waveform, sample_rate = load_torchcodec(path)
         if sample_rate != self.sample_rate:
             raise ValueError(
                 f"The dataset contains audio file of sample rate {sample_rate}, "

diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
@@ -65,6 +65,7 @@
 import matplotlib.pyplot as plt
 from torchaudio.models.decoder import ctc_decoder
 from torchaudio.utils import download_asset
+from torchaudio.utils import load_torchcodec
 
 ######################################################################
 #
@@ -98,7 +99,7 @@
 #    i really was very much afraid of showing him how much shocked i was at some parts of what he said
 #
 
-waveform, sample_rate = torchaudio.load(speech_file)
+waveform, sample_rate = load_torchcodec(speech_file)
 
 if sample_rate != bundle.sample_rate:
     waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

diff --git a/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_cuda_ctc_decoder_tutorial.py
@@ -54,6 +54,7 @@
 
 import torch
 import torchaudio
+from torchaudio.utils import load_torchcodec
 
 print(torch.__version__)
 print(torchaudio.__version__)
@@ -96,7 +97,7 @@ def download_asset_external(url, key):
 #
 
 speech_file = download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
-waveform, sample_rate = torchaudio.load(speech_file)
+waveform, sample_rate = load_torchcodec(speech_file)
 assert sample_rate == 16000
 IPython.display.Audio(speech_file)
 

diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py
@@ -15,6 +15,7 @@
 
 import torch
 import torchaudio
+from torchaudio.utils import load_torchcodec
 import torchaudio.functional as F
 
 print(torch.__version__)
@@ -52,7 +53,7 @@
 #
 
 # Load the data
-waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
+waveform1, sample_rate = load_torchcodec(SAMPLE_WAV, channels_first=False)
 
 # Define effects
 effect = ",".join(
@@ -159,7 +160,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
 # and clap your hands.
 #
 
-rir_raw, sample_rate = torchaudio.load(SAMPLE_RIR)
+rir_raw, sample_rate = load_torchcodec(SAMPLE_RIR)
 plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)")
 plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)")
 Audio(rir_raw, rate=sample_rate)
@@ -179,7 +180,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
 # we convolve the speech signal with the RIR.
 #
 
-speech, _ = torchaudio.load(SAMPLE_SPEECH)
+speech, _ = load_torchcodec(SAMPLE_SPEECH)
 augmented = F.fftconvolve(speech, rir)
 
 ######################################################################
@@ -219,8 +220,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
 # To add noise to audio data per SNRs, we
 # use :py:func:`torchaudio.functional.add_noise`.
 
-speech, _ = torchaudio.load(SAMPLE_SPEECH)
-noise, _ = torchaudio.load(SAMPLE_NOISE)
+speech, _ = load_torchcodec(SAMPLE_SPEECH)
+noise, _ = load_torchcodec(SAMPLE_NOISE)
 noise = noise[:, : speech.shape[1]]
 
 snr_dbs = torch.tensor([20, 10, 3])
@@ -275,7 +276,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
 # a Tensor object.
 #
 
-waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False)
+waveform, sample_rate = load_torchcodec(SAMPLE_SPEECH, channels_first=False)
 
 
 def apply_codec(waveform, sample_rate, format, encoder=None):
@@ -332,7 +333,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None):
 #
 
 sample_rate = 16000
-original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH)
+original_speech, sample_rate = load_torchcodec(SAMPLE_SPEECH)
 
 plot_specgram(original_speech, sample_rate, title="Original")
 
@@ -345,7 +346,7 @@ def apply_codec(waveform, sample_rate, format, encoder=None):
 # Because the noise is recorded in the actual environment, we consider that
 # the noise contains the acoustic feature of the environment. Therefore, we add
 # the noise after RIR application.
-noise, _ = torchaudio.load(SAMPLE_NOISE)
+noise, _ = load_torchcodec(SAMPLE_NOISE)
 noise = noise[:, : rir_applied.shape[1]]
 
 snr_db = torch.tensor([8])