Switch to runtime model download for video generation workers

ConalMullan · claude · ConalMullan · commit dbc09692f691 · 2025-12-31T19:16:35.000Z
GitHub Actions runners have insufficient disk (~14GB) for baking 30-40GB models into Docker images at build time.

Changes:
- Models are downloaded at container startup and cached to a network volume
- New download_models.py handles HuggingFace downloads with caching
- Image size reduced from ~30GB to ~8GB
- First cold start: 5-10 min (download); subsequent cold starts: ~30s (cached)

Requires a RunPod Network Volume for cost-effective operation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/docker/runpod-qwen-edit/Dockerfile b/docker/runpod-qwen-edit/Dockerfile
@@ -1,10 +1,13 @@
 # RunPod Serverless handler for Qwen-Image-Edit with LightX2V acceleration
 #
-# Build: docker buildx build --platform linux/amd64 -t ghcr.io/conalmullan/video-toolkit-qwen-edit:latest --push .
+# Build: docker buildx build --platform linux/amd64 -t ghcr.io/digitalsamba/video-toolkit-qwen-edit:latest --push .
 #
-# Image size: ~25GB (includes pre-baked model weights for fast cold starts)
+# Image size: ~8GB (models downloaded at runtime, cached to network volume)
 #
-# Version: 1.0.0 - CUDA 12.4, PyTorch 2.5, LightX2V + Qwen-Image-Edit-2511-Lightning
+# Version: 1.1.0 - CUDA 12.4, PyTorch 2.5, LightX2V + runtime model download
+#
+# Model Download: ~30GB downloaded on first run, cached to /runpod-volume/models/
+# Cold start: ~5-10 min (first run), ~30s (cached)
 #
 # GPU Requirements:
 #   - Minimum: 24GB VRAM (L4, RTX 4090) with FP8 quantization
@@ -76,53 +79,26 @@ RUN pip3 install --no-cache-dir \
 
 WORKDIR /app
 
-# Create models directory
-RUN mkdir -p /models/qwen-edit /models/qwen-edit-fp8
-
-# Download Qwen-Image-Edit-2511 base model (~20GB)
-# Uses HF hub to cache in standard location
-RUN python3 -c "\
-from huggingface_hub import snapshot_download; \
-print('Downloading Qwen-Image-Edit-2511 base model...'); \
-snapshot_download( \
-    repo_id='Qwen/Qwen-Image-Edit-2511', \
-    local_dir='/models/qwen-edit', \
-    ignore_patterns=['*.md', '*.txt', '.gitattributes'] \
-); \
-print('Base model downloaded successfully'); \
-"
-
-# Download FP8 quantized Lightning weights (~10GB)
-RUN python3 -c "\
-from huggingface_hub import snapshot_download; \
-print('Downloading FP8 Lightning weights...'); \
-snapshot_download( \
-    repo_id='lightx2v/Qwen-Image-Edit-2511-Lightning', \
-    local_dir='/models/qwen-edit-fp8', \
-    ignore_patterns=['*.md', '*.txt', '.gitattributes'] \
-); \
-print('FP8 weights downloaded successfully'); \
-"
-
-# Copy handler
+# Copy handler and download script
 COPY handler.py /app/handler.py
+COPY download_models.py /app/download_models.py
 
-# Environment
+# Environment - models will be downloaded to network volume at runtime
 ENV PYTHONUNBUFFERED=1
-ENV HF_HOME=/root/.cache/huggingface
+ENV HF_HOME=/runpod-volume/.cache/huggingface
 ENV LIGHTX2V_PATH=/app/lightx2v
-ENV MODEL_PATH=/models/qwen-edit
-ENV FP8_WEIGHTS_PATH=/models/qwen-edit-fp8
+# Default network-volume locations; download_models.py picks the actual paths at runtime (falls back to /models when no volume is mounted)
+ENV MODEL_BASE_PATH=/runpod-volume/models
+ENV MODEL_PATH=/runpod-volume/models/qwen-edit
+ENV FP8_WEIGHTS_PATH=/runpod-volume/models/qwen-edit-fp8
 
-# Health check - verify imports and model paths
+# Health check - verify imports only (models downloaded at runtime)
 RUN python3 -c "\
 import torch; \
 print(f'PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}'); \
 from lightx2v import LightX2VPipeline; \
 print('LightX2V import OK'); \
-from pathlib import Path; \
-print(f'Model path exists: {Path(\"/models/qwen-edit\").exists()}'); \
-print(f'FP8 weights exist: {Path(\"/models/qwen-edit-fp8\").exists()}'); \
+print('Models will be downloaded at runtime to /runpod-volume/models/'); \
 "
 
 # Run handler
diff --git a/docker/runpod-qwen-edit/download_models.py b/docker/runpod-qwen-edit/download_models.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""
+Model download script for Qwen-Image-Edit with LightX2V.
+
+Downloads models to network volume for caching across cold starts.
+Supports both RunPod network volumes and local fallback.
+"""
+
+import os
+import sys
+from pathlib import Path
+from huggingface_hub import snapshot_download
+
+
+def get_model_paths():
+    """
+    Choose where model weights live and make sure the directories exist.
+
+    A writable /runpod-volume is preferred; otherwise container-local paths
+    are used and a warning is printed. The chosen base and cache directories
+    are created, HF_HOME is pointed at the cache directory, and a dict with
+    the keys "base_path", "model_path" and "fp8_path" is returned.
+    """
+    has_volume = Path("/runpod-volume").exists() and os.access(
+        "/runpod-volume", os.W_OK
+    )
+
+    if not has_volume:
+        # Container-local storage: nothing here survives a cold start
+        base_path = Path("/models")
+        cache_path = Path("/root/.cache/huggingface")
+        print("WARNING: No network volume found, using ephemeral storage")
+        print("Models will be re-downloaded on each cold start!")
+    else:
+        base_path = Path("/runpod-volume/models")
+        cache_path = Path("/runpod-volume/.cache/huggingface")
+        print("Using RunPod network volume for model storage")
+
+    for directory in (base_path, cache_path):
+        directory.mkdir(parents=True, exist_ok=True)
+
+    # NOTE(review): huggingface_hub is already imported at module load; if it
+    # resolves HF_HOME at import time this assignment may not change its
+    # cache location - confirm.
+    os.environ["HF_HOME"] = str(cache_path)
+
+    return dict(
+        base_path=base_path,
+        model_path=base_path / "qwen-edit",
+        fp8_path=base_path / "qwen-edit-fp8",
+    )
+
+
+def check_model_exists(path: Path, min_files: int = 5) -> bool:
+    """
+    Report whether *path* looks like a populated model directory.
+
+    This is a heuristic only: the number of top-level entries under *path*
+    is compared against *min_files*; file contents are not inspected.
+    """
+    if path.exists():
+        entry_count = sum(1 for _ in path.glob("*"))
+        return entry_count >= min_files
+    return False
+
+
+# Empty file written into a model directory once its download has finished.
+# Without it, a download interrupted after enough files had landed would be
+# mistaken for a complete model and never resumed.
+_COMPLETE_MARKER = ".download_complete"
+
+
+def _is_download_complete(target: Path, min_files: int) -> bool:
+    """Return True only if a previous download into *target* ran to completion."""
+    marker = target / _COMPLETE_MARKER
+    return marker.is_file() and check_model_exists(target, min_files=min_files)
+
+
+def _download_snapshot(repo_id: str, target: Path, what: str) -> bool:
+    """
+    Download *repo_id* into *target* and record completion.
+
+    Returns True on success. On failure the error is printed (labelled with
+    *what*) and False is returned; no exception propagates to the caller.
+    """
+    try:
+        # NOTE(review): "*.txt" may also match nested files (e.g. a tokenizer
+        # merges.txt) - confirm nothing the pipeline needs is skipped.
+        snapshot_download(
+            repo_id=repo_id,
+            local_dir=str(target),
+            ignore_patterns=["*.md", "*.txt", ".gitattributes"],
+        )
+    except Exception as e:
+        print(f"ERROR downloading {what}: {e}")
+        return False
+
+    try:
+        (target / _COMPLETE_MARKER).touch()
+    except OSError as e:
+        # The weights are usable; only the skip-on-next-start shortcut is lost.
+        print(f"WARNING: could not write completion marker in {target}: {e}")
+    return True
+
+
+def download_base_model(model_path: Path) -> bool:
+    """
+    Download the Qwen-Image-Edit-2511 base model into *model_path*.
+
+    The download is skipped when a previous run completed (completion marker
+    present and at least 10 top-level entries). Returns True when the model
+    is available, False if the download failed.
+    """
+    if _is_download_complete(model_path, min_files=10):
+        print(f"Base model already exists at {model_path}")
+        return True
+
+    print("Downloading Qwen-Image-Edit-2511 base model (~20GB)...")
+    print("This may take 5-10 minutes on first run.")
+
+    if not _download_snapshot("Qwen/Qwen-Image-Edit-2511", model_path, "base model"):
+        return False
+    print("Base model downloaded successfully")
+    return True
+
+
+def download_fp8_weights(fp8_path: Path) -> bool:
+    """
+    Download the FP8 quantized Lightning weights into *fp8_path*.
+
+    The download is skipped when a previous run completed (completion marker
+    present and at least 3 top-level entries). Returns True when the weights
+    are available, False if the download failed.
+    """
+    if _is_download_complete(fp8_path, min_files=3):
+        print(f"FP8 weights already exist at {fp8_path}")
+        return True
+
+    print("Downloading FP8 Lightning weights (~10GB)...")
+
+    if not _download_snapshot(
+        "lightx2v/Qwen-Image-Edit-2511-Lightning", fp8_path, "FP8 weights"
+    ):
+        return False
+    print("FP8 weights downloaded successfully")
+    return True
+
+
+def ensure_models_downloaded() -> dict:
+    """
+    Resolve the model locations and fetch whatever is missing.
+
+    Returns the path dict produced by get_model_paths(). Raises RuntimeError
+    as soon as either download reports failure.
+    """
+    paths = get_model_paths()
+    base_dir, fp8_dir = paths["model_path"], paths["fp8_path"]
+
+    # Base model first; the FP8 weights are only attempted if it succeeded
+    base_ok = download_base_model(base_dir)
+    if not base_ok:
+        raise RuntimeError("Failed to download base model")
+
+    fp8_ok = download_fp8_weights(fp8_dir)
+    if not fp8_ok:
+        raise RuntimeError("Failed to download FP8 weights")
+
+    print("\nAll models ready:")
+    print(f"  Base model: {base_dir}")
+    print(f"  FP8 weights: {fp8_dir}")
+
+    return paths
+
+
+if __name__ == "__main__":
+    # Standalone entry point: pre-download the models and report the outcome
+    # through the process exit code (0 = ready, 1 = failed).
+    try:
+        ensure_models_downloaded()
+    except Exception as e:
+        # Top-level boundary: report on stderr so the failure is not mixed
+        # into the progress output on stdout.
+        print(f"\nModel download failed: {e}", file=sys.stderr)
+        sys.exit(1)
+    print("\nModel download complete!")
+    sys.exit(0)
diff --git a/docker/runpod-qwen-edit/handler.py b/docker/runpod-qwen-edit/handler.py
@@ -48,9 +48,9 @@
 import torch
 from PIL import Image
 
-# Model paths (baked into Docker image)
-MODEL_PATH = Path(os.environ.get("MODEL_PATH", "/models/qwen-edit"))
-FP8_WEIGHTS_PATH = Path(os.environ.get("FP8_WEIGHTS_PATH", "/models/qwen-edit-fp8"))
+# Model paths - set after runtime download
+MODEL_PATH = None
+FP8_WEIGHTS_PATH = None
 
 # Lazy-loaded pipeline
 _pipeline = None
@@ -62,6 +62,23 @@ def log(message: str) -> None:
     print(message, file=sys.stderr, flush=True)
 
 
+def ensure_models() -> None:
+    """
+    Ensure models are downloaded and the module-level paths are set.
+
+    Returns immediately once both MODEL_PATH and FP8_WEIGHTS_PATH are set and
+    exist on disk. Otherwise runs the download step and stores the resulting
+    paths in the module globals. Any exception raised by
+    ensure_models_downloaded() propagates to the caller.
+    """
+    global MODEL_PATH, FP8_WEIGHTS_PATH
+
+    # Both paths must be present: checking only the base model would treat a
+    # missing or unset FP8 weights path as "already initialized".
+    known_paths = (MODEL_PATH, FP8_WEIGHTS_PATH)
+    if all(p is not None and p.exists() for p in known_paths):
+        return  # Already initialized
+
+    log("Checking/downloading models...")
+    from download_models import ensure_models_downloaded
+
+    paths = ensure_models_downloaded()
+    MODEL_PATH = paths["model_path"]
+    FP8_WEIGHTS_PATH = paths["fp8_path"]
+
+    log(f"Models ready: {MODEL_PATH}")
+
+
 def get_gpu_vram_gb() -> int:
     """Detect GPU VRAM using PyTorch."""
     try:
@@ -102,6 +119,9 @@ def get_pipeline(use_fp8: bool = True):
     """Get or initialize LightX2V pipeline (lazy loading)."""
     global _pipeline, _pipeline_config
 
+    # Ensure models are downloaded first
+    ensure_models()
+
     # Check if we need to reinitialize (different config)
     current_config = {"use_fp8": use_fp8}
     if _pipeline is not None and _pipeline_config == current_config:
@@ -348,15 +368,20 @@ def handler(job: dict) -> dict:
 # RunPod serverless entry point
 if __name__ == "__main__":
     log("Starting RunPod Qwen-Edit handler...")
-    log(f"Model path: {MODEL_PATH}, exists: {MODEL_PATH.exists()}")
-    log(f"FP8 weights path: {FP8_WEIGHTS_PATH}, exists: {FP8_WEIGHTS_PATH.exists()}")
 
-    # Check CUDA
+    # Check CUDA first
     if torch.cuda.is_available():
         log(f"CUDA available: {torch.cuda.get_device_name(0)}")
         vram_gb = get_gpu_vram_gb()
         log(f"VRAM: {vram_gb}GB")
     else:
         log("WARNING: CUDA not available!")
 
+    # Download models at startup (before serverless loop)
+    # This happens during container initialization, not during job execution
+    log("Downloading models at startup (this may take 5-10 min on first run)...")
+    ensure_models()
+    log(f"Model path: {MODEL_PATH}")
+    log(f"FP8 weights path: {FP8_WEIGHTS_PATH}")
+
     runpod.serverless.start({"handler": handler})
diff --git a/docker/runpod-wan-i2v/Dockerfile b/docker/runpod-wan-i2v/Dockerfile
@@ -1,10 +1,13 @@
 # RunPod Serverless handler for Wan2.2 Image-to-Video with LightX2V acceleration
 #
-# Build: docker buildx build --platform linux/amd64 -t ghcr.io/conalmullan/video-toolkit-wan-i2v:latest --push .
+# Build: docker buildx build --platform linux/amd64 -t ghcr.io/digitalsamba/video-toolkit-wan-i2v:latest --push .
 #
-# Image size: ~40GB (includes pre-baked model weights for fast cold starts)
+# Image size: ~8GB (models downloaded at runtime, cached to network volume)
 #
-# Version: 1.0.0 - CUDA 12.4, PyTorch 2.5, LightX2V + Wan2.2-I2V-A14B
+# Version: 1.1.0 - CUDA 12.4, PyTorch 2.5, LightX2V + runtime model download
+#
+# Model Download: ~35GB downloaded on first run, cached to /runpod-volume/models/
+# Cold start: ~10-15 min (first run), ~30s (cached)
 #
 # GPU Requirements:
 #   - Minimum: 24GB VRAM (L4, RTX 4090) with aggressive offloading
@@ -78,39 +81,24 @@ RUN pip3 install --no-cache-dir \
 
 WORKDIR /app
 
-# Create models directory
-RUN mkdir -p /models/wan-i2v
-
-# Download Wan2.2-I2V-A14B model (~35GB)
-# MoE (Mixture of Experts) architecture - 14B active parameters
-RUN python3 -c "\
-from huggingface_hub import snapshot_download; \
-print('Downloading Wan2.2-I2V-A14B model (this will take a while)...'); \
-snapshot_download( \
-    repo_id='Wan-AI/Wan2.2-I2V-A14B', \
-    local_dir='/models/wan-i2v', \
-    ignore_patterns=['*.md', '*.txt', '.gitattributes'] \
-); \
-print('Wan2.2 model downloaded successfully'); \
-"
-
-# Copy handler
+# Copy handler and download script
 COPY handler.py /app/handler.py
+COPY download_models.py /app/download_models.py
 
-# Environment
+# Environment - models will be downloaded to network volume at runtime
 ENV PYTHONUNBUFFERED=1
-ENV HF_HOME=/root/.cache/huggingface
+ENV HF_HOME=/runpod-volume/.cache/huggingface
 ENV LIGHTX2V_PATH=/app/lightx2v
-ENV MODEL_PATH=/models/wan-i2v
+ENV MODEL_BASE_PATH=/runpod-volume/models
+ENV MODEL_PATH=/runpod-volume/models/wan-i2v
 
-# Health check - verify imports and model paths
+# Health check - verify imports only (models downloaded at runtime)
 RUN python3 -c "\
 import torch; \
 print(f'PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}'); \
 from lightx2v import LightX2VPipeline; \
 print('LightX2V import OK'); \
-from pathlib import Path; \
-print(f'Model path exists: {Path(\"/models/wan-i2v\").exists()}'); \
+print('Models will be downloaded at runtime to /runpod-volume/models/'); \
 "
 
 # Run handler
diff --git a/docker/runpod-wan-i2v/download_models.py b/docker/runpod-wan-i2v/download_models.py
diff --git a/docker/runpod-wan-i2v/handler.py b/docker/runpod-wan-i2v/handler.py