Qihoo360
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎README.md
+123-1 b/‎README.md
+123-1
diff --git a/‎configs/inference/i2v.yaml
+26 b/‎configs/inference/i2v.yaml
+26
diff --git a/‎configs/inference/t2v_pixars.yaml
+26 b/‎configs/inference/t2v_pixars.yaml
+26
diff --git a/‎configs/inference/t2v_realcartoon3d.yaml
+26 b/‎configs/inference/t2v_realcartoon3d.yaml
+26
diff --git a/‎configs/inference/t2v_toonyou.yaml
+26 b/‎configs/inference/t2v_toonyou.yaml
+26
diff --git a/‎fancyvideo/models/__pycache__/motion_module.cpython-310.pyc
8.4 KB b/‎fancyvideo/models/__pycache__/motion_module.cpython-310.pyc
8.4 KB
diff --git a/‎fancyvideo/models/__pycache__/resnet.cpython-310.pyc
5.31 KB b/‎fancyvideo/models/__pycache__/resnet.cpython-310.pyc
5.31 KB
diff --git a/‎fancyvideo/models/__pycache__/unet.cpython-310.pyc
14.8 KB b/‎fancyvideo/models/__pycache__/unet.cpython-310.pyc
14.8 KB
diff --git a/‎fancyvideo/models/__pycache__/unet_blocks.cpython-310.pyc
13.5 KB b/‎fancyvideo/models/__pycache__/unet_blocks.cpython-310.pyc
13.5 KB
diff --git a/‎fancyvideo/models/ctgm/__pycache__/attention.cpython-310.pyc
16.3 KB b/‎fancyvideo/models/ctgm/__pycache__/attention.cpython-310.pyc
16.3 KB
diff --git a/‎fancyvideo/models/ctgm/__pycache__/temporal_cross_module.cpython-310.pyc
7.59 KB b/‎fancyvideo/models/ctgm/__pycache__/temporal_cross_module.cpython-310.pyc
7.59 KB
@@ -0,0 +1 @@
+resources/
@@ -1,2 +1,124 @@
 # FancyVideo
-This is the official reproduction of FancyVideo.
+
+This repository is the official implementation of [FancyVideo](https://360cvgroup.github.io/FancyVideo/).
+
+**[FancyVideo: Towards Dynamic and Consistent Video Generation via Cross-frame Textual Guidance](https://arxiv.org/abs/2408.08189)** 
+<br>
+Jiasong Feng*, Ao Ma*, Jing Wang*, Bo Cheng, Xiaodan Liang, Dawei Leng†, Yuhui Yin (*Equal Contribution, †Corresponding Author)
+<br>
+[![arXiv](https://img.shields.io/badge/arXiv-2408.08189-b31b1b.svg)](https://arxiv.org/abs/2408.08189)
+[![Project Page](https://img.shields.io/badge/Project-Website-green)](https://360cvgroup.github.io/FancyVideo/)
+
+Our code builds upon [AnimateDiff](https://github.com/guoyww/AnimateDiff), and we also incorporate insights from [CV-VAE](https://github.com/AILab-CVC/CV-VAE), [Res-Adapter](https://github.com/bytedance/res-adapter), and [Long-CLIP](https://github.com/beichenzbc/Long-CLIP) to enhance our project. We appreciate the open-source contributions of these works.
+
+
+## 🔥 News
+- **[2024/08/19]** We initialized this GitHub repository and released the inference code and the 61-frame model.
+- **[2024/08/15]** We released the paper of [FancyVideo](https://arxiv.org/abs/2408.08189).
+
+
+## Quick Demos
+Video demos can be found on the [webpage](https://360cvgroup.github.io/FancyVideo/). Some of them are contributed by the community. You can customize your own videos using the inference code below.
+
+
+## Quick Start
+### 0. Experimental environment
+We tested our inference code on a machine with a 24GB 3090 GPU and CUDA environment version 12.1.
+
+### 1. Setup repository and environment
+```
+git clone https://github.com/360CVGroup/FancyVideo.git
+cd FancyVideo
+
+conda create -n fancyvideo python=3.10
+conda activate fancyvideo
+pip install -r requirements.txt
+```
+
+### 2. Prepare the models
+```
+mkdir -p resources/models
+
+# fancyvideo-ckpts
+wget -O resources/models/fancyvideo_ckpts.zip "https://drive.google.com/uc?export=download&id=1m4UqKVQ3POI5ei1A9yppHX_H--8PKMtn"
+unzip resources/models/fancyvideo_ckpts.zip
+
+# cv-vae
+wget -O resources/models/CV-VAE.zip "https://drive.google.com/uc?export=download&id=1Xal1fxVbVWf0jjiPK5gb_1-lOh0w8G_r"
+unzip resources/models/CV-VAE.zip
+
+# res-adapter
+wget -O resources/models/res-adapter.zip "https://drive.google.com/uc?export=download&id=18EawVd1HJtrQds703sLqoYZtLfbUgLm4"
+unzip resources/models/res-adapter.zip
+
+# longclip
+wget -O resources/models/LongCLIP-L.zip "https://drive.google.com/uc?export=download&id=1-DDPcbAbmGZJPHsdl1PgFMVtxmOnUtc7"
+unzip resources/models/LongCLIP-L.zip
+
+# sdv1.5-base-models (you can also download from civitai.com)
+wget -O resources/models/sd_v1-5_base_models.zip "https://drive.google.com/uc?export=download&id=1pxrAVT8OQKyyyW2WgImqEQrectbIpkBH"
+unzip resources/models/sd_v1-5_base_models.zip
+
+# stable-diffusion-v1-5
+git lfs install
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 resources/models/stable-diffusion-v1-5
+```
+After downloading the models, your resources folder should look like this:
+```
+📦 resources/
+├── 📂 models/
+│   ├── 📂 fancyvideo_ckpts/
+│   ├── 📂 CV-VAE/
+│   ├── 📂 res-adapter/
+│   ├── 📂 LongCLIP-L/
+│   ├── 📂 sd_v1-5_base_models/
+│   └── 📂 stable-diffusion-v1-5/
+```
+
+### 3. Customize your own videos
+#### 3.1 Image to Video
+Due to the limited image generation capabilities of the SD1.5 model, we recommend generating the initial frame using a more advanced T2I model, such as SDXL, and then using our model's I2V capabilities to create the video.
+```
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=./ python scripts/demo.py --config configs/inference/i2v.yaml
+```
+#### 3.2 Text to Video with different base models
+Our model features universal T2V capabilities and can be customized with SD1.5 community base models.
+```
+# use the base model of pixars
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=./ python scripts/demo.py --config configs/inference/t2v_pixars.yaml
+
+# use the base model of realcartoon3d
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=./ python scripts/demo.py --config configs/inference/t2v_realcartoon3d.yaml
+
+# use the base model of toonyou
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=./ python scripts/demo.py --config configs/inference/t2v_toonyou.yaml
+```
+
+
+## Reference
+- AnimateDiff: https://github.com/guoyww/AnimateDiff
+- CV-VAE: https://github.com/AILab-CVC/CV-VAE
+- Res-Adapter: https://github.com/bytedance/res-adapter
+- Long-CLIP: https://github.com/beichenzbc/Long-CLIP
+
+
+## We Are Hiring
+We are seeking academic interns in the AIGC field. If interested, please send your resume to [[email protected]](mailto:[email protected]).
+
+
+## BibTeX
+```
+@misc{feng2024fancyvideodynamicconsistentvideo,
+        title={FancyVideo: Towards Dynamic and Consistent Video Generation via Cross-frame Textual Guidance}, 
+        author={Jiasong Feng and Ao Ma and Jing Wang and Bo Cheng and Xiaodan Liang and Dawei Leng and Yuhui Yin},
+        year={2024},
+        eprint={2408.08189},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2408.08189}, 
+}
+```
+
+
+## License
+This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE).
@@ -0,0 +1,26 @@
+model:
+  base_model_type: "realisticVisionV60B1_v51VAE"
+  model_path: "resources/models"
+  text_to_video_mm_path: "resources/models/fancyvideo_ckpts/vae_3d_61_frames/mp_rank_00_model_states.pt"
+  base_model_path: "resources/models/sd_v1-5_base_models/realisticVisionV60B1_v51VAE.safetensors"
+  res_adapter_type: "res_adapter_v2"
+  trained_keys: ["motion_modules.", "conv_in.weight", "fps_embedding.", "motion_embedding."]
+  vae_type: "vae_3d"
+  use_fps_embedding: true
+  use_motion_embedding: true
+  common_positive_prompt: "Best quality, masterpiece, ultra high res, photorealistic, Ultra realistic illustration, hyperrealistic, 8k"
+  common_negative_prompt: "(low quality:1.3), (worst quality:1.3),poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face,Facial blurring,a large crowd, many people,advertising, information, news, watermark, text, username, signature,out of frame, low res, error, cropped, worst quality, low quality, artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, nsfw, breast, naked, eroticism"
+
+inference:
+  infer_mode: "i2v"
+  resolution: [768, 768]
+  video_length: 16
+  output_fps: 25
+  cond_fps: 25
+  cond_motion_score: 3.0
+  use_noise_scheduler_snr: true
+  seed: 22
+  prompt_path: "resources/demos/test_prompts/test_i2v_prompt.txt"
+  reference_image_folder: "resources/demos/reference_images/768x768"
+  output_folder: "resources/demos/samples/i2v/realisticVisionV60B1_v51VAE/768x768"
+
@@ -0,0 +1,26 @@
+model:
+  base_model_type: "pixarsRendermanInspo_mk1"
+  model_path: "resources/models"
+  text_to_video_mm_path: "resources/models/fancyvideo_ckpts/vae_3d_61_frames/mp_rank_00_model_states.pt"
+  base_model_path: "resources/models/sd_v1-5_base_models/pixarsRendermanInspo_mk1.safetensors"
+  res_adapter_type: "res_adapter_v2"
+  trained_keys: ["motion_modules.", "conv_in.weight", "fps_embedding.", "motion_embedding."]
+  vae_type: "vae_3d"
+  use_fps_embedding: true
+  use_motion_embedding: true
+  common_positive_prompt: "Best quality, masterpiece, ultra high res, photorealistic, Ultra realistic illustration, hyperrealistic, 8k"
+  common_negative_prompt: "(low quality:1.3), (worst quality:1.3),poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face,Facial blurring,a large crowd, many people,advertising, information, news, watermark, text, username, signature,out of frame, low res, error, cropped, worst quality, low quality, artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, nsfw, breast, naked, eroticism"
+
+inference:
+  infer_mode: "t2v"
+  resolution: [768, 768]
+  video_length: 16
+  output_fps: 25
+  cond_fps: 25
+  cond_motion_score: 3.0
+  use_noise_scheduler_snr: true
+  seed: 22
+  prompt_path: "resources/demos/test_prompts/pixarsRendermanInspo_mk1.txt"
+  reference_image_folder: "resources/demos/reference_images/768x768"
+  output_folder: "resources/demos/samples/t2v/pixarsRendermanInspo_mk1/768x768"
+
@@ -0,0 +1,26 @@
+model:
+  base_model_type: "realcartoon3d_v15"
+  model_path: "resources/models"
+  text_to_video_mm_path: "resources/models/fancyvideo_ckpts/vae_3d_61_frames/mp_rank_00_model_states.pt"
+  base_model_path: "resources/models/sd_v1-5_base_models/realcartoon3d_v15.safetensors"
+  res_adapter_type: "res_adapter_v2"
+  trained_keys: ["motion_modules.", "conv_in.weight", "fps_embedding.", "motion_embedding."]
+  vae_type: "vae_3d"
+  use_fps_embedding: true
+  use_motion_embedding: true
+  common_positive_prompt: "Best quality, masterpiece, ultra high res, photorealistic, Ultra realistic illustration, hyperrealistic, 8k"
+  common_negative_prompt: "(low quality:1.3), (worst quality:1.3),poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face,Facial blurring,a large crowd, many people,advertising, information, news, watermark, text, username, signature,out of frame, low res, error, cropped, worst quality, low quality, artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, nsfw, breast, naked, eroticism"
+
+inference:
+  infer_mode: "t2v"
+  resolution: [768, 768]
+  video_length: 16
+  output_fps: 25
+  cond_fps: 25
+  cond_motion_score: 3.0
+  use_noise_scheduler_snr: true
+  seed: 22
+  prompt_path: "resources/demos/test_prompts/realcartoon3d_v15.txt"
+  reference_image_folder: "resources/demos/reference_images/768x768"
+  output_folder: "resources/demos/samples/t2v/realcartoon3d_v15/768x768"
+
@@ -0,0 +1,26 @@
+model:
+  base_model_type: "toonyou_beta3"
+  model_path: "resources/models"
+  text_to_video_mm_path: "resources/models/fancyvideo_ckpts/vae_3d_61_frames/mp_rank_00_model_states.pt"
+  base_model_path: "resources/models/sd_v1-5_base_models/toonyou_beta3.safetensors"
+  res_adapter_type: "res_adapter_v2"
+  trained_keys: ["motion_modules.", "conv_in.weight", "fps_embedding.", "motion_embedding."]
+  vae_type: "vae_3d"
+  use_fps_embedding: true
+  use_motion_embedding: true
+  common_positive_prompt: "Best quality, masterpiece, ultra high res, photorealistic, Ultra realistic illustration, hyperrealistic, 8k"
+  common_negative_prompt: "(low quality:1.3), (worst quality:1.3),poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face,Facial blurring,a large crowd, many people,advertising, information, news, watermark, text, username, signature,out of frame, low res, error, cropped, worst quality, low quality, artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, nsfw, breast, naked, eroticism"
+
+inference:
+  infer_mode: "t2v"
+  resolution: [768, 768]
+  video_length: 16
+  output_fps: 25
+  cond_fps: 25
+  cond_motion_score: 3.0
+  use_noise_scheduler_snr: true
+  seed: 22
+  prompt_path: "resources/demos/test_prompts/toonyou_beta3.txt"
+  reference_image_folder: "resources/demos/reference_images/768x768"
+  output_folder: "resources/demos/samples/t2v/toonyou_beta3/768x768"
+