     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,19 +118,29 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,23 +156,23 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {
@@ -190,14 +202,18 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 4.5,
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
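
A side note on how the new dummy numbers fit together (a sketch, not part of the commit): with transformers' default CLIPVisionConfig patch_size of 32, a 224-pixel input gives a 7 x 7 patch grid, i.e. 49 image patch embeddings, which appears to be where "image_emb_len": 49 and the reserved span 5..54 ("image_emb_start" + "image_emb_len" = "image_emb_end") come from. The same 224 presumably also motivates the image processor's switch from 336, and the larger max_sequence_length leaves room for those image embeddings alongside the text tokens.

from transformers import CLIPVisionConfig

# Same tiny vision tower as in get_dummy_components above; patch_size is left
# at the CLIPVisionConfig default of 32 (an assumption of this sketch).
vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    projection_dim=32,
    num_attention_heads=4,
    num_hidden_layers=2,
    image_size=224,
)

# 224 // 32 = 7 patches per side -> 7 * 7 = 49 patch embeddings.
num_patches = (vision_config.image_size // vision_config.patch_size) ** 2
assert num_patches == 49  # matches "image_emb_len"

# The prompt template reserves positions 5..54 for them:
# image_emb_end = image_emb_start + image_emb_len = 5 + 49 = 54.
assert 5 + num_patches == 54  # matches "image_emb_end"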