Conversation

@naomili0924 naomili0924 commented Sep 9, 2025

What does this PR do?

This PR addresses issue #9329 by combining the ControlNet and SparseControlNet pipelines for AnimateDiff into a single hybrid pipeline, `AnimateDiffHybridControlNetPipeline`.

Fixes #9329

Before submitting

To test this pipeline, please use the following code, which exercises both the standard ControlNet path (with AnimateLCM) and the SparseControlNet path:

```python
import os
import random
import torch
from PIL import Image
import matplotlib.pyplot as plt

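# Make sure Python picks up the local diffusers checkout (e.g. one containing this PR branch)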
import sys
sys.path.insert(0, "/content/drive/MyDrive/diffusers/src")

from diffusers import (
  AnimateDiffHybridControlNetPipeline,
  AutoencoderKL,
  ControlNetModel,
  SparseControlNetModel,
  MotionAdapter,
  DPMSolverMultistepScheduler,
  LCMScheduler,
)
from diffusers.utils import export_to_gif, load_video
# Additionally, you will need to preprocess the frames before they can be used with the ControlNet
# HF maintains just the right package for it: `pip install controlnet_aux`
from controlnet_aux.processor import ZoeDetector

prompt = "one man is skating on the ice"
negative_prompt = "bad quality, worst quality"

model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
motion_adapter_id = "wangfuyun/AnimateLCM"
controlnet_id = "lllyasviel/sd-controlnet-depth"
lora_adapter_id = "wangfuyun/AnimateLCM"
vae_id = "stabilityai/sd-vae-ft-mse"
device = "cuda"

# Load pipeline
vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
controlnet = ControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
# We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
pipe = AnimateDiffHybridControlNetPipeline.from_pretrained(
  model_id,
  controlnet=controlnet,
  motion_adapter=motion_adapter,
  vae=vae,
).to(device=device, dtype=torch.float16)
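# AnimateLCM is distilled for LCM-style sampling: switch to LCMScheduler and load its LCM LoRA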
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
pipe.load_lora_weights(lora_adapter_id, weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
pipe.set_adapters(["lcm-lora"], [0.8])

# Load conditioning frames
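# ZoeDetector produces the depth maps expected by the depth ControlNet loaded above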
depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
num_frames = 16
# Path to your frames folder
frames_folder = "./frames"
# Sort the frames by filename to preserve order, keep only image files, and take the first num_frames
frame_files = sorted(f for f in os.listdir(frames_folder) if f.endswith((".png", ".jpg", ".jpeg", ".gif")))[:num_frames]
# Load frames into a list of PIL Images
frames = [Image.open(os.path.join(frames_folder, f)) for f in frame_files]
print(f"Loaded {len(frames)} frames")
conditioning_frames = []
with pipe.progress_bar(total=len(frames)) as progress_bar:
    for frame in frames:
        conditioning_frames.append(depth_detector(frame))
        progress_bar.update()
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_frames=len(conditioning_frames),
    num_inference_steps=50,
    guidance_scale=2.0,
    conditioning_frames=conditioning_frames,
    generator=torch.Generator().manual_seed(42),
).frames[0]

export_to_gif(video, "animatediff_hybrid_controlnet.gif", fps=8)

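# --- Second test: SparseControlNet ---
# Reuse the same base model and VAE, but swap in the original AnimateDiff motion
# adapter, its motion LoRA, and a SparseControlNetModel (RGB variant).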
motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
controlnet_id = "guoyww/animatediff-sparsectrl-rgb"
lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
scheduler = DPMSolverMultistepScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    beta_schedule="linear",
    algorithm_type="dpmsolver++",
    use_karras_sigmas=True,
)
pipe = AnimateDiffHybridControlNetPipeline.from_pretrained(
    model_id,
    motion_adapter=motion_adapter,
    controlnet=controlnet,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.float16,
).to(device)
pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
pipe.fuse_lora(lora_scale=1.0)

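# SparseControlNet only needs a few keyframes: pass the conditioning images together
# with their frame indices via controlnet_frame_indices.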
condition_frame_indices = [0, 4, 8, 10, 15]
conditioning_frames = [frames[index] for index in condition_frame_indices]
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_frames=num_frames,
    num_inference_steps=25,
    conditioning_frames=conditioning_frames,
    controlnet_conditioning_scale=1.0,
    controlnet_frame_indices=condition_frame_indices,
    generator=torch.Generator().manual_seed(1337),
).frames[0]

export_to_gif(video, "animatediff_hybrid_sparse_controlnet.gif", fps=8)


Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.

@naomili0924 naomili0924 force-pushed the hybrid_controlnet branch 5 times, most recently from 8f34020 to bbd6341 on September 9, 2025 at 05:08.