feat: add threaded I/O pipeline for video processing

AnonymDevOSS · AnonymDevOSS · commit 98476dfacffe · 2025-10-28T14:05:37.000+01:00
Implements pipeline with bounded queues to overlap decode,
compute and encode. Reduces I/O stalls.
diff --git a/supervision/utils/video.py b/supervision/utils/video.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+import threading
 import time
 from collections import deque
 from collections.abc import Callable, Generator
 from dataclasses import dataclass
+from queue import Queue
 
 import cv2
 import numpy as np
@@ -255,6 +257,175 @@ def callback(scene: np.ndarray, index: int) -> np.ndarray:
                 sink.write_frame(frame=result_frame)
 
 
+def process_video_threads(
+    source_path: str,
+    target_path: str,
+    callback: Callable[[np.ndarray, int], np.ndarray],
+    *,
+    max_frames: int | None = None,
+    prefetch: int = 32,
+    writer_buffer: int = 32,
+    show_progress: bool = False,
+    progress_message: str = "Processing video (with threads)",
+) -> None:
+    """
+    Process a video using a threaded pipeline that asynchronously
+    reads frames, applies a callback to each, and writes the results
+    to an output file.
+
+    Overview:
+    This function implements a three-stage pipeline designed to maximize
+    frame throughput.
+
+        │   Reader   │ >> │  Processor   │ >> │   Writer   │
+           (thread)           (main)             (thread)
+
+    - Reader thread: reads frames from disk into a bounded queue ('read_q')
+      until full, then blocks. This caps the number of decoded frames waiting
+      for the callback at 'prefetch'.
+
+    - Main thread: dequeues frames, applies the 'callback(frame, idx)',
+      and enqueues the processed result into 'write_q'.
+      This is the compute stage. It's important to note that it's not threaded,
+      so you can safely use any detectors, trackers, or other stateful objects
+      without synchronization issues.
+
+    - Writer thread: dequeues frames and writes them to disk.
+
+    Both queues are bounded to enforce back-pressure:
+      - The reader cannot outpace processing (avoids unbounded RAM usage).
+      - The processor cannot outpace writing (avoids output buffer bloat).
+
+    Summary:
+    - It's thread-safe: because the callback runs only in the main thread,
+      using a single stateful detector/tracker inside callback does not require
+      synchronization with the reader/writer threads.
+
+    - While the main thread processes frame N, the reader is already decoding
+      frame N+1, and the writer is encoding frame N-1. They operate concurrently
+      without blocking each other.
+
+    - When is it fastest?
+        - When there's heavy computation in the callback function that releases
+          the Python GIL (for example, OpenCV filters, resizes, color
+          conversions, ...)
+        - When using CUDA or GPU-accelerated inference.
+
+    - When is it better not to use it?
+        - When the callback function is Python-heavy and GIL-bound. In that case,
+          using a process-based approach is more effective.
+
+    Failure handling:
+    If the reader, the callback or the writer raises, the other stages are told
+    to stop, both worker threads are unblocked and joined, and the exception is
+    re-raised in the calling thread. Not every frame is processed in that case,
+    so the output file may be incomplete.
+
+    Args:
+        source_path (str): The path to the source video file.
+        target_path (str): The path to the target video file.
+        callback (Callable[[np.ndarray, int], np.ndarray]): A function that takes in
+            a numpy ndarray representation of a video frame and an
+            int index of the frame and returns a processed numpy ndarray
+            representation of the frame.
+        max_frames (int | None): The maximum number of frames to process.
+        prefetch (int): The maximum number of frames buffered by the reader thread.
+            Zero or a negative value makes the queue unbounded.
+        writer_buffer (int): The maximum number of frames buffered before writing.
+            Zero or a negative value makes the queue unbounded.
+        show_progress (bool): Whether to show a progress bar.
+        progress_message (str): The message to display in the progress bar.
+
+    Raises:
+        Exception: Whatever `callback`, the reader thread or the writer thread
+            raised, re-raised after both threads have been joined.
+    """
+
+    source_video_info = VideoInfo.from_video_path(video_path=source_path)
+    total_frames = (
+        min(source_video_info.total_frames, max_frames)
+        if max_frames is not None
+        else source_video_info.total_frames
+    )
+
+    # Each queue carries frames followed by a single 'None' sentinel.
+    read_q: Queue[tuple[int, np.ndarray] | None] = Queue(maxsize=prefetch)
+    write_q: Queue[np.ndarray | None] = Queue(maxsize=writer_buffer)
+
+    # Set as soon as any stage fails (or finishes) so the others stop early.
+    stop_event = threading.Event()
+    # Exceptions caught inside the worker threads, re-raised by the main thread.
+    worker_errors: list[Exception] = []
+
+    def reader_thread() -> None:
+        try:
+            gen = get_video_frames_generator(source_path=source_path, end=max_frames)
+            for idx, frame in enumerate(gen):
+                if stop_event.is_set():
+                    break
+                read_q.put((idx, frame))
+        except Exception as error:
+            worker_errors.append(error)
+            stop_event.set()
+        finally:
+            # Always deliver the sentinel; without it the main thread would wait
+            # on 'read_q.get()' forever after a decoding failure.
+            read_q.put(None)
+
+    def writer_thread(video_sink: VideoSink) -> None:
+        try:
+            while True:
+                frame = write_q.get()
+                if frame is None:
+                    return
+                video_sink.write_frame(frame=frame)
+        except Exception as error:
+            worker_errors.append(error)
+            stop_event.set()
+            # Keep consuming (and discarding) until the sentinel arrives so the
+            # main thread can never block forever on a full 'write_q'.
+            while write_q.get() is not None:
+                pass
+
+    # Heads up! We set 'daemon=True' so these threads won't block program exit
+    # if the main thread finishes first.
+    t_reader = threading.Thread(target=reader_thread, daemon=True)
+    with VideoSink(target_path=target_path, video_info=source_video_info) as sink:
+        t_writer = threading.Thread(target=writer_thread, args=(sink,), daemon=True)
+        process_bar = tqdm(
+            total=total_frames, disable=not show_progress, desc=progress_message
+        )
+        t_reader.start()
+        t_writer.start()
+
+        reader_finished = False
+        try:
+            # Main thread: take a frame, apply the callback, queue the result.
+            while not stop_event.is_set():
+                item = read_q.get()
+                if item is None:
+                    reader_finished = True
+                    break
+                idx, frame = item
+                write_q.put(callback(frame, idx))
+                process_bar.update(1)
+        finally:
+            # Runs on success, on a worker failure and when 'callback' raises:
+            # unblock and join both workers before 'sink' is released.
+            stop_event.set()
+            # Discard pending frames until the reader's sentinel shows up, which
+            # frees a reader that is blocked on a full 'read_q'.
+            while not reader_finished and read_q.get() is not None:
+                pass
+            write_q.put(None)
+            t_reader.join()
+            t_writer.join()
+            process_bar.close()
+
+        if worker_errors:
+            raise worker_errors[0]
+
+
 class FPSMonitor:
     """
     A class for monitoring frames per second (FPS) to benchmark latency.