Commit 59f1b7b

Hunyuan I2V fast tests fix (#11341)
* update
* update

1 parent ce1063a · commit 59f1b7b

File tree: 2 files changed (+26 −10 lines)

src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py

+1 −1

@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
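The one-line pipeline change renames the keyword passed to the text encoder from pixel_value to pixel_values, which is the parameter name a Llava-style encoder (used by the updated test below) actually declares. A minimal, illustrative check of that assumption (not part of the commit, and assuming a transformers version that ships Llava):

import inspect

from transformers import LlavaForConditionalGeneration

# Illustrative only: the forward signature declares `pixel_values`,
# so a `pixel_value` keyword does not match any declared parameter.
params = inspect.signature(LlavaForConditionalGeneration.forward).parameters
print("pixel_values" in params)  # expected: True
print("pixel_value" in params)   # expected: False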

tests/pipelines/hunyuan_video/test_hunyuan_image2video.py

+25 −9

@@ -24,9 +24,11 @@
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig

 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,19 +118,29 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,23 +156,23 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )

         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")

         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )

         components = {
@@ -190,14 +202,18 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 4.5,
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
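One detail the test diff leaves implicit is where the new prompt_template numbers come from. My reading, based on transformers defaults rather than anything stated in the commit: CLIPVisionConfig defaults to patch_size=32, so the 224x224 dummy vision tower produces (224 // 32) ** 2 = 49 patch embeddings once the CLS token is dropped, which matches "image_emb_len": 49, and "image_emb_end": 54 is simply image_emb_start + image_emb_len = 5 + 49. A small sketch of that arithmetic:

from transformers import CLIPVisionConfig

# Sketch of the arithmetic behind the dummy prompt_template values above
# (assumes CLIPVisionConfig's default patch_size of 32; not part of the commit).
vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    projection_dim=32,
    num_attention_heads=4,
    num_hidden_layers=2,
    image_size=224,
)
image_emb_len = (vision_config.image_size // vision_config.patch_size) ** 2
image_emb_start = 5
image_emb_end = image_emb_start + image_emb_len
print(image_emb_len, image_emb_end)  # expected: 49 54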
