From 479c2a6ad5735138130abc40c2fb283372992817 Mon Sep 17 00:00:00 2001 From: jaminmc <1310376+jaminmc@users.noreply.github.com> Date: Sun, 14 Sep 2025 20:50:12 -0400 Subject: [PATCH 1/2] Add multi-language interface support and gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comprehensive multi-language support (Chinese, English, Spanish, German, Japanese, French, Portuguese, Russian) - Add language detection and localization utilities in wan/utils/language_utils.py - Update gradio interface with dynamic language switching - Add standard Python .gitignore file - Replace hardcoded 'path/StableAvatar' with dynamic SCRIPT_DIR variable - Use $(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) to get script location - Update all path references in inference.sh and multiple_gpu_inference.sh - Makes scripts portable and work regardless of installation location Optimize requirements.txt installation order for efficiency - Reorganize packages by dependency hierarchy and installation efficiency - Group packages logically: build tools → PyTorch → computer vision → ML frameworks → audio/video → utilities - Install large dependencies (PyTorch ecosystem) first to avoid conflicts - Install core numerical libraries (numpy) early as many packages depend on them - Add clear section comments explaining the installation strategy - Remove tokenizers (automatically installed as dependency of transformers) - Remove imageio-ffmpeg (automatically installed as dependency of imageio[ffmpeg]) - Streamline requirements.txt by removing packages that are automatically installed - Add documentation for optional packages that enhance performance/features Add comprehensive optional packages section to README - Add new 'Optional Packages for Enhanced Performance' section - Document all optional packages: flash-attn, xformers, bitsandbytes, audio-separator[gpu], decord - Include clear descriptions of what each package provides - Add note about automatic fallbacks when packages are not installed - Update existing flash_attn references for consistency - Help users understand which optional packages to install for their use case --- .gitignore | 131 ++++ README.md | 44 +- app.py | 240 +++--- audio_extractor.py | 1 + inference.py | 1 + inference.sh | 16 +- lip_mask_extractor.py | 1 + multiple_gpu_inference.sh | 16 +- requirements.txt | 83 +- train_14B.py | 1 + train_14B.sh | 1 + train_14B_lora.py | 1 + train_14B_lora.sh | 1 + train_1B_rec_vec.py | 1 + train_1B_rec_vec.sh | 1 + train_1B_rec_vec_64.sh | 1 + train_1B_rec_vec_lora.py | 1 + train_1B_rec_vec_lora.sh | 1 + train_1B_rec_vec_lora_64.sh | 1 + train_1B_square.py | 1 + train_1B_square.sh | 1 + train_1B_square_64.sh | 1 + vocal_seperator.py | 1 + wan/__init__.py | 1 + wan/configs/__init__.py | 1 + wan/configs/shared_config.py | 1 + wan/configs/wan_i2v_14B.py | 1 + wan/configs/wan_t2v_14B.py | 1 + wan/configs/wan_t2v_1_3B.py | 1 + wan/dataset/talking_video_dataset_fantasy.py | 1 + wan/dist/__init__.py | 1 + wan/dist/wan_xfuser.py | 1 + wan/distributed/__init__.py | 0 wan/distributed/fsdp.py | 1 + wan/distributed/xdit_context_parallel.py | 1 + wan/image2video.py | 1 + wan/models/__init__.py | 0 wan/models/attention_processor.py | 1 + wan/models/cache_utils.py | 1 + wan/models/motion_controller.py | 1 + wan/models/motion_to_bucket.py | 1 + wan/models/vocal_projector_fantasy.py | 1 + wan/models/vocal_projector_fantasy_14B.py | 1 + wan/models/vocal_projector_fantasy_1B.py | 1 + wan/models/wan_fantasy_transformer3d_14B.py | 1 + 
wan/models/wan_fantasy_transformer3d_1B.py | 1 + wan/models/wan_image_encoder.py | 1 + wan/models/wan_text_encoder.py | 1 + wan/models/wan_transformer3d.py | 1 + wan/models/wan_vae.py | 1 + wan/models/wan_xlm_roberta.py | 1 + wan/models/wav2vec.py | 1 + wan/pipeline/__init__.py | 0 wan/pipeline/pipeline_wan_fun_inpaint.py | 1 + wan/pipeline/wan_inference_long_pipeline.py | 1 + .../wan_inference_pipeline_fantasy.py | 1 + wan/text2video.py | 1 + wan/utils/__init__.py | 1 + wan/utils/color_correction.py | 1 + wan/utils/discrete_sampler.py | 1 + wan/utils/fm_solvers.py | 1 + wan/utils/fm_solvers_unipc.py | 1 + wan/utils/fp8_optimization.py | 1 + wan/utils/language_utils.py | 724 ++++++++++++++++++ wan/utils/lora_utils.py | 1 + wan/utils/prompt_extend.py | 1 + wan/utils/qwen_vl_utils.py | 1 + wan/utils/utils.py | 1 + 68 files changed, 1151 insertions(+), 161 deletions(-) create mode 100644 .gitignore mode change 100644 => 100755 app.py mode change 100644 => 100755 audio_extractor.py mode change 100644 => 100755 inference.py mode change 100644 => 100755 inference.sh mode change 100644 => 100755 lip_mask_extractor.py mode change 100644 => 100755 multiple_gpu_inference.sh mode change 100644 => 100755 train_14B.py mode change 100644 => 100755 train_14B.sh mode change 100644 => 100755 train_14B_lora.py mode change 100644 => 100755 train_14B_lora.sh mode change 100644 => 100755 train_1B_rec_vec.py mode change 100644 => 100755 train_1B_rec_vec.sh mode change 100644 => 100755 train_1B_rec_vec_64.sh mode change 100644 => 100755 train_1B_rec_vec_lora.py mode change 100644 => 100755 train_1B_rec_vec_lora.sh mode change 100644 => 100755 train_1B_rec_vec_lora_64.sh mode change 100644 => 100755 train_1B_square.py mode change 100644 => 100755 train_1B_square.sh mode change 100644 => 100755 train_1B_square_64.sh mode change 100644 => 100755 vocal_seperator.py mode change 100644 => 100755 wan/__init__.py mode change 100644 => 100755 wan/configs/__init__.py mode change 100644 => 100755 wan/configs/shared_config.py mode change 100644 => 100755 wan/configs/wan_i2v_14B.py mode change 100644 => 100755 wan/configs/wan_t2v_14B.py mode change 100644 => 100755 wan/configs/wan_t2v_1_3B.py mode change 100644 => 100755 wan/dataset/talking_video_dataset_fantasy.py mode change 100644 => 100755 wan/dist/__init__.py mode change 100644 => 100755 wan/dist/wan_xfuser.py mode change 100644 => 100755 wan/distributed/__init__.py mode change 100644 => 100755 wan/distributed/fsdp.py mode change 100644 => 100755 wan/distributed/xdit_context_parallel.py mode change 100644 => 100755 wan/image2video.py mode change 100644 => 100755 wan/models/__init__.py mode change 100644 => 100755 wan/models/attention_processor.py mode change 100644 => 100755 wan/models/cache_utils.py mode change 100644 => 100755 wan/models/motion_controller.py mode change 100644 => 100755 wan/models/motion_to_bucket.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy_14B.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy_1B.py mode change 100644 => 100755 wan/models/wan_fantasy_transformer3d_14B.py mode change 100644 => 100755 wan/models/wan_fantasy_transformer3d_1B.py mode change 100644 => 100755 wan/models/wan_image_encoder.py mode change 100644 => 100755 wan/models/wan_text_encoder.py mode change 100644 => 100755 wan/models/wan_transformer3d.py mode change 100644 => 100755 wan/models/wan_vae.py mode change 100644 => 100755 wan/models/wan_xlm_roberta.py mode change 100644 
=> 100755 wan/models/wav2vec.py mode change 100644 => 100755 wan/pipeline/__init__.py mode change 100644 => 100755 wan/pipeline/pipeline_wan_fun_inpaint.py mode change 100644 => 100755 wan/pipeline/wan_inference_long_pipeline.py mode change 100644 => 100755 wan/pipeline/wan_inference_pipeline_fantasy.py mode change 100644 => 100755 wan/text2video.py mode change 100644 => 100755 wan/utils/__init__.py mode change 100644 => 100755 wan/utils/color_correction.py mode change 100644 => 100755 wan/utils/discrete_sampler.py mode change 100644 => 100755 wan/utils/fm_solvers.py mode change 100644 => 100755 wan/utils/fm_solvers_unipc.py mode change 100644 => 100755 wan/utils/fp8_optimization.py create mode 100755 wan/utils/language_utils.py mode change 100644 => 100755 wan/utils/lora_utils.py mode change 100644 => 100755 wan/utils/prompt_extend.py mode change 100644 => 100755 wan/utils/qwen_vl_utils.py mode change 100644 => 100755 wan/utils/utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a7797d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,131 @@ +# Checkpoints and model files +checkpoints/ +checkpoints + +# Generated outputs +outputs/ + +# Virtual environments +venv/ +.venv/ +env/ +.env/ + +# Python cache files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db diff --git a/README.md b/README.md index 0ce50ca..00eeac2 100644 --- a/README.md +++ b/README.md @@ -96,22 +96,50 @@ For the basic version of the model checkpoint (Wan2.1-1.3B-based), it supports g ### 🧱 Environment setup -``` +Choose the appropriate setup based on your hardware: + +#### CUDA 12.4 (RTX 40xx series and earlier) +```bash pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu124 pip install -r requirements.txt -# Optional to install flash_attn to accelerate attention computation -pip install flash_attn +# Optional: install flash-attn for faster attention computation (NVIDIA only) +pip install flash-attn ``` -### 🧱 Environment setup for Blackwell series chips - -``` +#### CUDA 12.8 (Blackwell series chips - RTX 50xx, B200, etc.)
+```bash pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128 pip install -r requirements.txt -# Optional to install flash_attn to accelerate attention computation -pip install flash_attn +# Optional: install flash-attn for faster attention computation (NVIDIA only) +pip install flash-attn +``` + +#### CPU-only (macOS, Linux without GPU, or for testing) +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +pip install -r requirements.txt +``` + +### 🚀 Optional Packages for Enhanced Performance + +For better performance and additional features, you can install these optional packages: + +```bash +# Memory efficient attention (alternative to flash-attn, works on more hardware) +pip install xformers + +# 8-bit training optimization (for LoRA training) +pip install bitsandbytes + +# Vocal separation functionality +pip install "audio-separator[gpu]" + +# Faster video reading (not available on macOS, falls back to torchvision automatically) +pip install decord ``` +**Note**: All these packages are optional. The system will automatically fall back to standard implementations if they're not installed. Install only the packages you need for your specific use case. + ### 🧱 Download weights If you encounter connection issues with Hugging Face, you can utilize the mirror endpoint by setting the environment variable: `export HF_ENDPOINT=https://hf-mirror.com`. Please download weights manually as follows: diff --git a/app.py b/app.py old mode 100644 new mode 100755 index 7d84bd4..bb2fcc8 --- a/app.py +++ b/app.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import psutil import argparse @@ -15,6 +16,10 @@ from wan.pipeline.wan_inference_long_pipeline import WanI2VTalkingInferenceLongPipeline from wan.utils.fp8_optimization import replace_parameters_by_name, convert_weight_dtype_wrapper, convert_model_weight_to_float8 from wan.utils.utils import get_image_to_video_latent, save_videos_grid +from wan.utils.language_utils import ( + detect_browser_language, get_interface_texts, get_display_language, + get_language_choices, create_language_detection_js +) import numpy as np import librosa import datetime @@ -277,162 +282,165 @@ def vocal_separation(audio_path): return f"outputs/{timestamp}.wav", f"Generated outputs/{timestamp}.wav / 已生成outputs/{timestamp}.wav" +def detect_and_set_language(request): + """Detect browser language and return appropriate language setting.""" + try: + # Get Accept-Language header from request + accept_language = request.headers.get('Accept-Language', '') + detected_lang = detect_browser_language(accept_language) + return get_display_language(detected_lang) + except Exception: + return "中文" # Default fallback + + def update_language(language): - if language == "English": - return { - GPU_memory_mode: gr.Dropdown(label="GPU Memory Mode", info="Normal uses 25G VRAM, model_cpu_offload uses 13G VRAM"), - teacache_threshold: gr.Slider(label="TeaCache Threshold", info="Recommended 0.1, 0 disables TeaCache acceleration"), - num_skip_start_steps: gr.Slider(label="Skip Start Steps", info="Recommended 5"), - clip_sample_n_frames: gr.Slider(label="Clip Sample Frames", info="Video frames, 81=2s@25fps, 161=4s@25fps, must be 4n+1"), - image_path: gr.Image(label="Upload Image"), - audio_path: gr.Audio(label="Upload Audio"), - prompt: gr.Textbox(label="Prompt"), - negative_prompt: gr.Textbox(label="Negative Prompt"), - generate_button: gr.Button("🎬 Start Generation"), - width: 
gr.Slider(label="Width"), - height: gr.Slider(label="Height"), - exchange_button: gr.Button("🔄 Swap Width/Height"), - adjust_button: gr.Button("Adjust Size Based on Image"), - guidance_scale: gr.Slider(label="Guidance Scale"), - num_inference_steps: gr.Slider(label="Sampling Steps (Recommended 50)"), - text_guide_scale: gr.Slider(label="Text Guidance Scale"), - audio_guide_scale: gr.Slider(label="Audio Guidance Scale"), - motion_frame: gr.Slider(label="Motion Frame"), - fps: gr.Slider(label="FPS"), - overlap_window_length: gr.Slider(label="Overlap Window Length"), - seed_param: gr.Number(label="Seed (positive integer, -1 for random)"), - info: gr.Textbox(label="Status"), - video_output: gr.Video(label="Generated Result"), - seed_output: gr.Textbox(label="Seed"), - video_path: gr.Video(label="Upload Video"), - extractor_button: gr.Button("🎬 Start Extraction"), - info2: gr.Textbox(label="Status"), - audio_output: gr.Audio(label="Generated Result"), - audio_path3: gr.Audio(label="Upload Audio"), - separation_button: gr.Button("🎬 Start Separation"), - info3: gr.Textbox(label="Status"), - audio_output3: gr.Audio(label="Generated Result") - } - else: - return { - GPU_memory_mode: gr.Dropdown(label="显存模式", info="Normal占用25G显存,model_cpu_offload占用13G显存"), - teacache_threshold: gr.Slider(label="teacache threshold", info="推荐参数0.1,0为禁用teacache加速"), - num_skip_start_steps: gr.Slider(label="跳过开始步数", info="推荐参数5"), - clip_sample_n_frames: gr.Slider(label="Clip采样帧数", info="视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1"), - image_path: gr.Image(label="上传图片"), - audio_path: gr.Audio(label="上传音频"), - prompt: gr.Textbox(label="提示词"), - negative_prompt: gr.Textbox(label="负面提示词"), - generate_button: gr.Button("🎬 开始生成"), - width: gr.Slider(label="宽度"), - height: gr.Slider(label="高度"), - exchange_button: gr.Button("🔄 交换宽高"), - adjust_button: gr.Button("根据图片调整宽高"), - guidance_scale: gr.Slider(label="guidance scale"), - num_inference_steps: gr.Slider(label="采样步数(推荐50步)"), - text_guide_scale: gr.Slider(label="text guidance scale"), - audio_guide_scale: gr.Slider(label="audio guidance scale"), - motion_frame: gr.Slider(label="motion frame"), - fps: gr.Slider(label="帧率"), - overlap_window_length: gr.Slider(label="overlap window length"), - seed_param: gr.Number(label="种子,请输入正整数,-1为随机"), - info: gr.Textbox(label="提示信息"), - video_output: gr.Video(label="生成结果"), - seed_output: gr.Textbox(label="种子"), - video_path: gr.Video(label="上传视频"), - extractor_button: gr.Button("🎬 开始提取"), - info2: gr.Textbox(label="提示信息"), - audio_output: gr.Audio(label="生成结果"), - audio_path3: gr.Audio(label="上传音频"), - separation_button: gr.Button("🎬 开始分离"), - info3: gr.Textbox(label="提示信息"), - audio_output3: gr.Audio(label="生成结果") - } + """Update interface language based on user selection.""" + # The language parameter is actually the language code (second element of tuple) + # So we can use it directly + lang_code = language # language is already the code like 'es', 'de', 'ja', etc. + texts = get_interface_texts(lang_code) + + # Return component updates in the same order as all_components + return [ + gr.Markdown(f""" +
+        <div>
+            <h1>{texts['main']['title']}</h1>
+        </div>
+ """), + gr.Dropdown(label=texts["model_settings"]["gpu_memory_mode"], info=texts["model_settings"]["gpu_memory_info"]), + gr.Slider(label=texts["model_settings"]["teacache_threshold"], info=texts["model_settings"]["teacache_info"]), + gr.Slider(label=texts["model_settings"]["num_skip_start_steps"], info=texts["model_settings"]["skip_steps_info"]), + gr.Slider(label=texts["model_settings"]["clip_sample_n_frames"], info=texts["model_settings"]["clip_frames_info"]), + gr.Image(label=texts["video_generation"]["upload_image"]), + gr.Audio(label=texts["video_generation"]["upload_audio"]), + gr.Textbox(label=texts["video_generation"]["prompt"]), + gr.Textbox(label=texts["video_generation"]["negative_prompt"], value=texts["video_generation"]["negative_prompt_default"]), + gr.Button(texts["video_generation"]["start_generation"]), + gr.Slider(label=texts["video_generation"]["width"]), + gr.Slider(label=texts["video_generation"]["height"]), + gr.Button(texts["video_generation"]["swap_dimensions"]), + gr.Button(texts["video_generation"]["adjust_size"]), + gr.Slider(label=texts["video_generation"]["guidance_scale"]), + gr.Slider(label=texts["video_generation"]["sampling_steps"]), + gr.Slider(label=texts["video_generation"]["text_guide_scale"]), + gr.Slider(label=texts["video_generation"]["audio_guide_scale"]), + gr.Slider(label=texts["video_generation"]["motion_frame"]), + gr.Slider(label=texts["video_generation"]["fps"]), + gr.Slider(label=texts["video_generation"]["overlap_window_length"]), + gr.Number(label=texts["video_generation"]["seed"]), + gr.Textbox(label=texts["video_generation"]["status"]), + gr.Video(label=texts["video_generation"]["generated_result"]), + gr.Textbox(label=texts["video_generation"]["seed_output"]), + gr.Video(label=texts["audio_extraction"]["upload_video"]), + gr.Button(texts["audio_extraction"]["start_extraction"]), + gr.Textbox(label=texts["audio_extraction"]["status"]), + gr.Audio(label=texts["audio_extraction"]["generated_result"]), + gr.Audio(label=texts["vocal_separation"]["upload_audio"]), + gr.Button(texts["vocal_separation"]["start_separation"]), + gr.Textbox(label=texts["vocal_separation"]["status"]), + gr.Audio(label=texts["vocal_separation"]["generated_result"]) + ] + +# Get initial language texts (default to English) +initial_texts = get_interface_texts("en") with gr.Blocks(theme=gr.themes.Base()) as demo: - gr.Markdown(""" + # Create dynamic device info component that updates with language + device_info_display = gr.Markdown(f"""
-        <div>
-            <h1>StableAvatar</h1>
-        </div>
+        <div>
+            <h1>{initial_texts['main']['title']}</h1>
+        </div>
""") + # Set English as the default language (use language code since Radio uses codes as values) + default_language = "en" + language_radio = gr.Radio( - choices=["English", "中文"], - value="中文", - label="Language / 语言" + choices=get_language_choices(), + value=default_language, + label=initial_texts['main']['language_label'] ) - with gr.Accordion("Model Settings / 模型设置", open=False): + with gr.Accordion(initial_texts['main']['model_settings'], open=False): with gr.Row(): GPU_memory_mode = gr.Dropdown( - label = "显存模式", - info = "Normal占用25G显存,model_cpu_offload占用13G显存", + label = initial_texts['model_settings']['gpu_memory_mode'], + info = initial_texts['model_settings']['gpu_memory_info'], choices = ["Normal", "model_cpu_offload", "model_cpu_offload_and_qfloat8", "sequential_cpu_offload"], value = "Normal" ) - teacache_threshold = gr.Slider(label="teacache threshold", info = "推荐参数0.1,0为禁用teacache加速", minimum=0, maximum=1, step=0.01, value=0) - num_skip_start_steps = gr.Slider(label="跳过开始步数", info = "推荐参数5", minimum=0, maximum=100, step=1, value=5) + teacache_threshold = gr.Slider( + label=initial_texts['model_settings']['teacache_threshold'], + info=initial_texts['model_settings']['teacache_info'], + minimum=0, maximum=1, step=0.01, value=0 + ) + num_skip_start_steps = gr.Slider( + label=initial_texts['model_settings']['num_skip_start_steps'], + info=initial_texts['model_settings']['skip_steps_info'], + minimum=0, maximum=100, step=1, value=5 + ) with gr.Row(): clip_sample_n_frames = gr.Slider( - label="Clip Sample Frames", - info="视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1", + label=initial_texts['model_settings']['clip_sample_n_frames'], + info=initial_texts['model_settings']['clip_frames_info'], minimum=41, maximum=321, step=4, value=81 ) - with gr.TabItem("StableAvatar"): + with gr.TabItem(initial_texts['main']['video_generation']): with gr.Row(): with gr.Column(): with gr.Row(): - image_path = gr.Image(label="上传图片", type="filepath", height=280) - audio_path = gr.Audio(label="上传音频", type="filepath") - prompt = gr.Textbox(label="提示词", value="") - negative_prompt = gr.Textbox(label="负面提示词", value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走") - generate_button = gr.Button("🎬 开始生成", variant='primary') - with gr.Accordion("Parameter Settings / 参数设置", open=True): + image_path = gr.Image(label=initial_texts['video_generation']['upload_image'], type="filepath", height=280) + audio_path = gr.Audio(label=initial_texts['video_generation']['upload_audio'], type="filepath") + prompt = gr.Textbox(label=initial_texts['video_generation']['prompt'], value="") + negative_prompt = gr.Textbox(label=initial_texts['video_generation']['negative_prompt'], value=initial_texts['video_generation']['negative_prompt_default']) + generate_button = gr.Button(initial_texts['video_generation']['start_generation'], variant='primary') + with gr.Accordion(initial_texts['main']['model_settings'], open=True): with gr.Row(): - width = gr.Slider(label="宽度", minimum=256, maximum=2048, step=16, value=512) - height = gr.Slider(label="高度", minimum=256, maximum=2048, step=16, value=512) + width = gr.Slider(label=initial_texts['video_generation']['width'], minimum=256, maximum=2048, step=16, value=512) + height = gr.Slider(label=initial_texts['video_generation']['height'], minimum=256, maximum=2048, step=16, value=512) with gr.Row(): - exchange_button = gr.Button("🔄 交换宽高") - adjust_button = gr.Button("根据图片调整宽高") + exchange_button = 
gr.Button(initial_texts['video_generation']['swap_dimensions']) + adjust_button = gr.Button(initial_texts['video_generation']['adjust_size']) with gr.Row(): - guidance_scale = gr.Slider(label="guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=6.0) - num_inference_steps = gr.Slider(label="采样步数(推荐50步)", minimum=1, maximum=100, step=1, value=10) + guidance_scale = gr.Slider(label=initial_texts['video_generation']['guidance_scale'], minimum=1.0, maximum=10.0, step=0.1, value=6.0) + num_inference_steps = gr.Slider(label=initial_texts['video_generation']['sampling_steps'], minimum=1, maximum=100, step=1, value=10) with gr.Row(): - text_guide_scale = gr.Slider(label="text guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.0) - audio_guide_scale = gr.Slider(label="audio guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=5.0) + text_guide_scale = gr.Slider(label=initial_texts['video_generation']['text_guide_scale'], minimum=1.0, maximum=10.0, step=0.1, value=3.0) + audio_guide_scale = gr.Slider(label=initial_texts['video_generation']['audio_guide_scale'], minimum=1.0, maximum=10.0, step=0.1, value=5.0) with gr.Row(): - motion_frame = gr.Slider(label="motion frame", minimum=1, maximum=50, step=1, value=25) - fps = gr.Slider(label="帧率", minimum=1, maximum=60, step=1, value=25) + motion_frame = gr.Slider(label=initial_texts['video_generation']['motion_frame'], minimum=1, maximum=50, step=1, value=25) + fps = gr.Slider(label=initial_texts['video_generation']['fps'], minimum=1, maximum=60, step=1, value=25) with gr.Row(): - overlap_window_length = gr.Slider(label="overlap window length", minimum=1, maximum=20, step=1, value=5) - seed_param = gr.Number(label="种子,请输入正整数,-1为随机", value=-1) + overlap_window_length = gr.Slider(label=initial_texts['video_generation']['overlap_window_length'], minimum=1, maximum=20, step=1, value=5) + seed_param = gr.Number(label=initial_texts['video_generation']['seed'], value=-1) with gr.Column(): - info = gr.Textbox(label="提示信息", interactive=False) - video_output = gr.Video(label="生成结果", interactive=False) - seed_output = gr.Textbox(label="种子") - with gr.TabItem("Audio Extraction / 音频提取"): + info = gr.Textbox(label=initial_texts['video_generation']['status'], interactive=False) + video_output = gr.Video(label=initial_texts['video_generation']['generated_result'], interactive=False) + seed_output = gr.Textbox(label=initial_texts['video_generation']['seed_output']) + with gr.TabItem(initial_texts['main']['audio_extraction']): with gr.Row(): with gr.Column(): - video_path = gr.Video(label="上传视频", height=500) - extractor_button = gr.Button("🎬 开始提取", variant='primary') + video_path = gr.Video(label=initial_texts['audio_extraction']['upload_video'], height=500) + extractor_button = gr.Button(initial_texts['audio_extraction']['start_extraction'], variant='primary') with gr.Column(): - info2 = gr.Textbox(label="提示信息", interactive=False) - audio_output = gr.Audio(label="生成结果", interactive=False) - with gr.TabItem("Vocal Separation / 人声分离"): + info2 = gr.Textbox(label=initial_texts['audio_extraction']['status'], interactive=False) + audio_output = gr.Audio(label=initial_texts['audio_extraction']['generated_result'], interactive=False) + with gr.TabItem(initial_texts['main']['vocal_separation']): with gr.Row(): with gr.Column(): - audio_path3 = gr.Audio(label="上传音频", type="filepath") - separation_button = gr.Button("🎬 开始分离", variant='primary') + audio_path3 = gr.Audio(label=initial_texts['vocal_separation']['upload_audio'], type="filepath") + 
separation_button = gr.Button(initial_texts['vocal_separation']['start_separation'], variant='primary') with gr.Column(): - info3 = gr.Textbox(label="提示信息", interactive=False) - audio_output3 = gr.Audio(label="生成结果", interactive=False) + info3 = gr.Textbox(label=initial_texts['vocal_separation']['status'], interactive=False) + audio_output3 = gr.Audio(label=initial_texts['vocal_separation']['generated_result'], interactive=False) - all_components = [GPU_memory_mode, teacache_threshold, num_skip_start_steps, clip_sample_n_frames, image_path, audio_path, prompt, negative_prompt, generate_button, width, height, exchange_button, adjust_button, guidance_scale, num_inference_steps, text_guide_scale, audio_guide_scale, motion_frame, fps, overlap_window_length, seed_param, info, video_output, seed_output, video_path, extractor_button, info2, audio_output, audio_path3, separation_button, info3, audio_output3] + all_components = [device_info_display, GPU_memory_mode, teacache_threshold, num_skip_start_steps, clip_sample_n_frames, image_path, audio_path, prompt, negative_prompt, generate_button, width, height, exchange_button, adjust_button, guidance_scale, num_inference_steps, text_guide_scale, audio_guide_scale, motion_frame, fps, overlap_window_length, seed_param, info, video_output, seed_output, video_path, extractor_button, info2, audio_output, audio_path3, separation_button, info3, audio_output3] + # Use the full update_language function to translate everything language_radio.change( fn=update_language, inputs=[language_radio], @@ -493,4 +501,4 @@ def update_language(language): share=args.share, mcp_server=args.mcp_server, inbrowser=True, - ) + ) \ No newline at end of file diff --git a/audio_extractor.py b/audio_extractor.py old mode 100644 new mode 100755 index 1ba2061..7fd0305 --- a/audio_extractor.py +++ b/audio_extractor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os from moviepy.editor import VideoFileClip import argparse diff --git a/inference.py b/inference.py old mode 100644 new mode 100755 index 64cb461..4bfd7d3 --- a/inference.py +++ b/inference.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/inference.sh b/inference.sh old mode 100644 new mode 100755 index 55ddae1..8c8e43c --- a/inference.sh +++ b/inference.sh @@ -1,14 +1,18 @@ +#!/bin/bash +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + export TOKENIZERS_PARALLELISM=false -export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" +export MODEL_NAME="$SCRIPT_DIR/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" CUDA_VISIBLE_DEVICES=0 python inference.py \ --config_path="deepspeed_config/wan2.1/wan_civitai.yaml" \ --pretrained_model_name_or_path=$MODEL_NAME \ - --transformer_path="path/StableAvatar/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ - --pretrained_wav2vec_path="path/StableAvatar/checkpoints/wav2vec2-base-960h" \ - --validation_reference_path="path/StableAvatar/examples/case-1/reference.png" \ - --validation_driven_audio_path="path/StableAvatar/examples/case-1/audio.wav" \ - --output_dir="path/StableAvatar/output_infer" \ + --transformer_path="$SCRIPT_DIR/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ + --pretrained_wav2vec_path="$SCRIPT_DIR/checkpoints/wav2vec2-base-960h" \ + --validation_reference_path="$SCRIPT_DIR/examples/case-1/reference.png" \ + --validation_driven_audio_path="$SCRIPT_DIR/examples/case-1/audio.wav" \ + --output_dir="$SCRIPT_DIR/output_infer" \ 
--validation_prompts="A middle-aged woman with short light brown hair, wearing pearl earrings and a blue blazer, is speaking passionately in front of a blurred background resembling a government building. Her mouth is open mid-phrase, her expression is engaged and energetic, and the lighting is bright and even, suggesting a television interview or live broadcast. The scene gives the impression she is singing with conviction and purpose." \ --seed=42 \ --ulysses_degree=1 \ diff --git a/lip_mask_extractor.py b/lip_mask_extractor.py old mode 100644 new mode 100755 index a35306c..fac99c3 --- a/lip_mask_extractor.py +++ b/lip_mask_extractor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os diff --git a/multiple_gpu_inference.sh b/multiple_gpu_inference.sh old mode 100644 new mode 100755 index 8d12fdd..c3446df --- a/multiple_gpu_inference.sh +++ b/multiple_gpu_inference.sh @@ -1,5 +1,9 @@ +#!/bin/bash +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + export TOKENIZERS_PARALLELISM=false -export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" +export MODEL_NAME="$SCRIPT_DIR/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export WORLD_SIZE=4 export MASTER_ADDR="localhost" export MASTER_PORT=29500 @@ -7,11 +11,11 @@ export MASTER_PORT=29500 torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 --master_addr="localhost" --master_port=29500 inference.py \ --config_path="deepspeed_config/wan2.1/wan_civitai.yaml" \ --pretrained_model_name_or_path=$MODEL_NAME \ - --transformer_path="path/StableAvatar/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ - --pretrained_wav2vec_path="path/StableAvatar/checkpoints/wav2vec2-base-960h" \ - --validation_reference_path="path/StableAvatar/examples/case-1/reference.png" \ - --validation_driven_audio_path="path/StableAvatar/examples/case-1/audio.wav" \ - --output_dir="path/StableAvatar/output_infer" \ + --transformer_path="$SCRIPT_DIR/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ + --pretrained_wav2vec_path="$SCRIPT_DIR/checkpoints/wav2vec2-base-960h" \ + --validation_reference_path="$SCRIPT_DIR/examples/case-1/reference.png" \ + --validation_driven_audio_path="$SCRIPT_DIR/examples/case-1/audio.wav" \ + --output_dir="$SCRIPT_DIR/output_infer" \ --validation_prompts="A middle-aged woman with short light brown hair, wearing pearl earrings and a blue blazer, is speaking passionately in front of a blurred background resembling a government building. Her mouth is open mid-phrase, her expression is engaged and energetic, and the lighting is bright and even, suggesting a television interview or live broadcast. The scene gives the impression she is singing with conviction and purpose." \ --seed=42 \ --ulysses_degree=2 \ diff --git a/requirements.txt b/requirements.txt index 2f69e17..4140b68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,70 @@ +# PyTorch installation notes: +# For CUDA 12.4: pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu124 +# For CUDA 12.8: pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128 +# The version ranges below support both configurations +# +# Installation order optimized for efficiency: +# 1. Build tools and core dependencies first +# 2. PyTorch ecosystem (large packages that others depend on) +# 3. Computer vision and image processing +# 4. ML/AI frameworks +# 5. Audio/video processing +# 6. 
Utilities and other packages +# +# Removed packages that are dependencies of others: +# - tokenizers (dependency of transformers) +# - imageio-ffmpeg (dependency of imageio[ffmpeg]) +# +# Optional packages (for enhanced performance/features): +# - flash-attn (for faster attention computation, install with: pip install flash-attn) +# - xformers (for memory efficient attention, install with: pip install xformers) +# - bitsandbytes (for 8-bit training, install with: pip install bitsandbytes) +# - audio-separator[gpu] (for vocal separation, install with: pip install audio-separator[gpu]) +# - decord (for faster video reading, install with: pip install decord) +# Note: decord is not available on macOS, falls back to torchvision automatically + +# Build tools and core dependencies ninja -Pillow -einops +numpy>=1.23.5,<2 +tqdm +easydict +omegaconf safetensors -timm -tomesd -torch==2.7.0 + +# PyTorch ecosystem (install first as other packages depend on these) +torch>=2.6.0,<=2.7.0 +torchvision>=0.21.0,<=0.22.0 +torchaudio>=2.1.1,<=2.7.0 torchdiffeq torchsde -decord -datasets -torchvision==0.22.0 + +# Computer vision and image processing +Pillow opencv-python>=4.9.0.80 -diffusers==0.30.1 -transformers==4.51.3 -tokenizers>=0.20.3 -accelerate>=1.1.1 -tqdm -easydict -ftfy -dashscope -imageio-ffmpeg -gradio>=5.0.0 -numpy>=1.23.5,<2 scikit-image -opencv-python -omegaconf -SentencePiece albumentations imageio[ffmpeg] imageio[pyav] -tensorboard -beautifulsoup4 + +# ML/AI frameworks and transformers +transformers==4.51.3 +diffusers==0.30.1 +accelerate>=1.1.1 +timm +tomesd +einops ftfy +SentencePiece +dashscope + +# Audio and video processing librosa -torchaudio==2.7.0 moviepy==1.0.3 + +# Data handling and utilities +datasets +tensorboard +beautifulsoup4 + +# Web interface +gradio>=5.0.0 diff --git a/train_14B.py b/train_14B.py old mode 100644 new mode 100755 index 4835a7b..fb18430 --- a/train_14B.py +++ b/train_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_14B.sh b/train_14B.sh old mode 100644 new mode 100755 index 77dcf60..c4fd6cd --- a/train_14B.sh +++ b/train_14B.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-I2V-14B-480P" diff --git a/train_14B_lora.py b/train_14B_lora.py old mode 100644 new mode 100755 index f6ef6bd..a7a60c8 --- a/train_14B_lora.py +++ b/train_14B_lora.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_14B_lora.sh b/train_14B_lora.sh old mode 100644 new mode 100755 index e3b2700..b10e7b8 --- a/train_14B_lora.sh +++ b/train_14B_lora.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-I2V-14B-480P" diff --git a/train_1B_rec_vec.py b/train_1B_rec_vec.py old mode 100644 new mode 100755 index b4ad332..5f7199a --- a/train_1B_rec_vec.py +++ b/train_1B_rec_vec.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_rec_vec.sh b/train_1B_rec_vec.sh old mode 100644 new mode 100755 index c0a916b..c970199 --- a/train_1B_rec_vec.sh +++ b/train_1B_rec_vec.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_rec_vec_64.sh b/train_1B_rec_vec_64.sh old mode 100644 new mode 100755 index 9aa709b..3d30898 --- a/train_1B_rec_vec_64.sh +++ b/train_1B_rec_vec_64.sh 
@@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/train_1B_rec_vec_lora.py b/train_1B_rec_vec_lora.py old mode 100644 new mode 100755 index 3dbb92d..b770d49 --- a/train_1B_rec_vec_lora.py +++ b/train_1B_rec_vec_lora.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_rec_vec_lora.sh b/train_1B_rec_vec_lora.sh old mode 100644 new mode 100755 index 7f368bd..daf1419 --- a/train_1B_rec_vec_lora.sh +++ b/train_1B_rec_vec_lora.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_rec_vec_lora_64.sh b/train_1B_rec_vec_lora_64.sh old mode 100644 new mode 100755 index 5a9f4d0..6919e49 --- a/train_1B_rec_vec_lora_64.sh +++ b/train_1B_rec_vec_lora_64.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/train_1B_square.py b/train_1B_square.py old mode 100644 new mode 100755 index f5aeb21..b0e5a90 --- a/train_1B_square.py +++ b/train_1B_square.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_square.sh b/train_1B_square.sh old mode 100644 new mode 100755 index eb7d5a3..ce4bb57 --- a/train_1B_square.sh +++ b/train_1B_square.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_square_64.sh b/train_1B_square_64.sh old mode 100644 new mode 100755 index 6280cf3..9cd99b4 --- a/train_1B_square_64.sh +++ b/train_1B_square_64.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/vocal_seperator.py b/vocal_seperator.py old mode 100644 new mode 100755 index 3ecdb84..ed1be3d --- a/vocal_seperator.py +++ b/vocal_seperator.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import shutil diff --git a/wan/__init__.py b/wan/__init__.py old mode 100644 new mode 100755 index 62b57c6..5db04d7 --- a/wan/__init__.py +++ b/wan/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # from . import configs, distributed, modules # from .image2video import WanI2V # from .text2video import WanT2V diff --git a/wan/configs/__init__.py b/wan/configs/__init__.py old mode 100644 new mode 100755 index c72d2d0..5d23be8 --- a/wan/configs/__init__.py +++ b/wan/configs/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import copy import os diff --git a/wan/configs/shared_config.py b/wan/configs/shared_config.py old mode 100644 new mode 100755 index 04a9f45..0a8de04 --- a/wan/configs/shared_config.py +++ b/wan/configs/shared_config.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch from easydict import EasyDict diff --git a/wan/configs/wan_i2v_14B.py b/wan/configs/wan_i2v_14B.py old mode 100644 new mode 100755 index 12e8e20..3ff0c23 --- a/wan/configs/wan_i2v_14B.py +++ b/wan/configs/wan_i2v_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
import torch from easydict import EasyDict diff --git a/wan/configs/wan_t2v_14B.py b/wan/configs/wan_t2v_14B.py old mode 100644 new mode 100755 index 9d0ee69..25d4206 --- a/wan/configs/wan_t2v_14B.py +++ b/wan/configs/wan_t2v_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from easydict import EasyDict diff --git a/wan/configs/wan_t2v_1_3B.py b/wan/configs/wan_t2v_1_3B.py old mode 100644 new mode 100755 index ea9502b..68ad709 --- a/wan/configs/wan_t2v_1_3B.py +++ b/wan/configs/wan_t2v_1_3B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from easydict import EasyDict diff --git a/wan/dataset/talking_video_dataset_fantasy.py b/wan/dataset/talking_video_dataset_fantasy.py old mode 100644 new mode 100755 index b23b796..8569c9c --- a/wan/dataset/talking_video_dataset_fantasy.py +++ b/wan/dataset/talking_video_dataset_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import math import os import random diff --git a/wan/dist/__init__.py b/wan/dist/__init__.py old mode 100644 new mode 100755 index 8da6edf..825b5d4 --- a/wan/dist/__init__.py +++ b/wan/dist/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.distributed as dist diff --git a/wan/dist/wan_xfuser.py b/wan/dist/wan_xfuser.py old mode 100644 new mode 100755 index 9e02bf8..090f3c2 --- a/wan/dist/wan_xfuser.py +++ b/wan/dist/wan_xfuser.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.amp as amp diff --git a/wan/distributed/__init__.py b/wan/distributed/__init__.py old mode 100644 new mode 100755 diff --git a/wan/distributed/fsdp.py b/wan/distributed/fsdp.py old mode 100644 new mode 100755 index 18ba2f3..524f8d1 --- a/wan/distributed/fsdp.py +++ b/wan/distributed/fsdp.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc from functools import partial diff --git a/wan/distributed/xdit_context_parallel.py b/wan/distributed/xdit_context_parallel.py old mode 100644 new mode 100755 index 01936ce..2da7a5e --- a/wan/distributed/xdit_context_parallel.py +++ b/wan/distributed/xdit_context_parallel.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch import torch.cuda.amp as amp diff --git a/wan/image2video.py b/wan/image2video.py old mode 100644 new mode 100755 index b375fb9..2f771a6 --- a/wan/image2video.py +++ b/wan/image2video.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
import gc import logging diff --git a/wan/models/__init__.py b/wan/models/__init__.py old mode 100644 new mode 100755 diff --git a/wan/models/attention_processor.py b/wan/models/attention_processor.py old mode 100644 new mode 100755 index 23029b1..823c628 --- a/wan/models/attention_processor.py +++ b/wan/models/attention_processor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from typing import Callable, List, Optional, Tuple, Union diff --git a/wan/models/cache_utils.py b/wan/models/cache_utils.py old mode 100644 new mode 100755 index d55d87f..ee520a1 --- a/wan/models/cache_utils.py +++ b/wan/models/cache_utils.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import numpy as np import torch diff --git a/wan/models/motion_controller.py b/wan/models/motion_controller.py old mode 100644 new mode 100755 index 9529bef..b430a51 --- a/wan/models/motion_controller.py +++ b/wan/models/motion_controller.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.nn as nn diff --git a/wan/models/motion_to_bucket.py b/wan/models/motion_to_bucket.py old mode 100644 new mode 100755 index 8425b45..48e2b1b --- a/wan/models/motion_to_bucket.py +++ b/wan/models/motion_to_bucket.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch from diffusers import ModelMixin from einops import rearrange diff --git a/wan/models/vocal_projector_fantasy.py b/wan/models/vocal_projector_fantasy.py old mode 100644 new mode 100755 index 921a0f7..7670c1d --- a/wan/models/vocal_projector_fantasy.py +++ b/wan/models/vocal_projector_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import torch import torch.nn as nn diff --git a/wan/models/vocal_projector_fantasy_14B.py b/wan/models/vocal_projector_fantasy_14B.py old mode 100644 new mode 100755 index 52eebf4..e8f8c89 --- a/wan/models/vocal_projector_fantasy_14B.py +++ b/wan/models/vocal_projector_fantasy_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import warnings diff --git a/wan/models/vocal_projector_fantasy_1B.py b/wan/models/vocal_projector_fantasy_1B.py old mode 100644 new mode 100755 index 9f6eb31..bb0e802 --- a/wan/models/vocal_projector_fantasy_1B.py +++ b/wan/models/vocal_projector_fantasy_1B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import warnings diff --git a/wan/models/wan_fantasy_transformer3d_14B.py b/wan/models/wan_fantasy_transformer3d_14B.py old mode 100644 new mode 100755 index 0eb6511..1935c55 --- a/wan/models/wan_fantasy_transformer3d_14B.py +++ b/wan/models/wan_fantasy_transformer3d_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/models/wan_fantasy_transformer3d_1B.py b/wan/models/wan_fantasy_transformer3d_1B.py old mode 100644 new mode 100755 index 869e5f7..2eb8533 --- a/wan/models/wan_fantasy_transformer3d_1B.py +++ b/wan/models/wan_fantasy_transformer3d_1B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
diff --git a/wan/models/wan_image_encoder.py b/wan/models/wan_image_encoder.py old mode 100644 new mode 100755 index 950b4cc..9ab8187 --- a/wan/models/wan_image_encoder.py +++ b/wan/models/wan_image_encoder.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip'' # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import math diff --git a/wan/models/wan_text_encoder.py b/wan/models/wan_text_encoder.py old mode 100644 new mode 100755 index 34a0323..f5b7f8f --- a/wan/models/wan_text_encoder.py +++ b/wan/models/wan_text_encoder.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/t5.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import math diff --git a/wan/models/wan_transformer3d.py b/wan/models/wan_transformer3d.py old mode 100644 new mode 100755 index 3507488..31ceb12 --- a/wan/models/wan_transformer3d.py +++ b/wan/models/wan_transformer3d.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/models/wan_vae.py b/wan/models/wan_vae.py old mode 100644 new mode 100755 index 4afb122..5cf8397 --- a/wan/models/wan_vae.py +++ b/wan/models/wan_vae.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/vae.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from typing import Tuple, Union diff --git a/wan/models/wan_xlm_roberta.py b/wan/models/wan_xlm_roberta.py old mode 100644 new mode 100755 index 755baf3..edc7045 --- a/wan/models/wan_xlm_roberta.py +++ b/wan/models/wan_xlm_roberta.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from transformers.models.xlm_roberta.modeling_xlm_roberta # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch diff --git a/wan/models/wav2vec.py b/wan/models/wav2vec.py old mode 100644 new mode 100755 index 5c2fad8..b328ad3 --- a/wan/models/wav2vec.py +++ b/wan/models/wav2vec.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding. 
It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities diff --git a/wan/pipeline/__init__.py b/wan/pipeline/__init__.py old mode 100644 new mode 100755 diff --git a/wan/pipeline/pipeline_wan_fun_inpaint.py b/wan/pipeline/pipeline_wan_fun_inpaint.py old mode 100644 new mode 100755 index 70dd114..568254d --- a/wan/pipeline/pipeline_wan_fun_inpaint.py +++ b/wan/pipeline/pipeline_wan_fun_inpaint.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from dataclasses import dataclass diff --git a/wan/pipeline/wan_inference_long_pipeline.py b/wan/pipeline/wan_inference_long_pipeline.py old mode 100644 new mode 100755 index 54c456b..4f7d7b7 --- a/wan/pipeline/wan_inference_long_pipeline.py +++ b/wan/pipeline/wan_inference_long_pipeline.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math import random diff --git a/wan/pipeline/wan_inference_pipeline_fantasy.py b/wan/pipeline/wan_inference_pipeline_fantasy.py old mode 100644 new mode 100755 index b89d4c6..859c8b2 --- a/wan/pipeline/wan_inference_pipeline_fantasy.py +++ b/wan/pipeline/wan_inference_pipeline_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from dataclasses import dataclass diff --git a/wan/text2video.py b/wan/text2video.py old mode 100644 new mode 100755 index 2400545..d66dbb2 --- a/wan/text2video.py +++ b/wan/text2video.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc import logging diff --git a/wan/utils/__init__.py b/wan/utils/__init__.py old mode 100644 new mode 100755 index 6e9a339..820bf01 --- a/wan/utils/__init__.py +++ b/wan/utils/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from .fm_solvers import (FlowDPMSolverMultistepScheduler, get_sampling_sigmas, retrieve_timesteps) from .fm_solvers_unipc import FlowUniPCMultistepScheduler diff --git a/wan/utils/color_correction.py b/wan/utils/color_correction.py old mode 100644 new mode 100755 index 83bef44..ef44906 --- a/wan/utils/color_correction.py +++ b/wan/utils/color_correction.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import numpy as np from skimage import color diff --git a/wan/utils/discrete_sampler.py b/wan/utils/discrete_sampler.py old mode 100644 new mode 100755 index 47f3557..a281fb1 --- a/wan/utils/discrete_sampler.py +++ b/wan/utils/discrete_sampler.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Modified from https://github.com/THUDM/CogVideo/blob/3710a612d8760f5cdb1741befeebb65b9e0f2fe0/sat/sgm/modules/diffusionmodules/sigma_sampling.py """ import torch diff --git a/wan/utils/fm_solvers.py b/wan/utils/fm_solvers.py old mode 100644 new mode 100755 index c908969..7a2e07f --- a/wan/utils/fm_solvers.py +++ b/wan/utils/fm_solvers.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py # Convert dpm solver for flow matching # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/utils/fm_solvers_unipc.py b/wan/utils/fm_solvers_unipc.py old mode 100644 new mode 100755 index 57321ba..ea810ba --- a/wan/utils/fm_solvers_unipc.py +++ b/wan/utils/fm_solvers_unipc.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py # Convert unipc for flow matching # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
diff --git a/wan/utils/fp8_optimization.py b/wan/utils/fp8_optimization.py old mode 100644 new mode 100755 index cf77946..99dc72a --- a/wan/utils/fp8_optimization.py +++ b/wan/utils/fp8_optimization.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Modified from https://github.com/kijai/ComfyUI-MochiWrapper """ import torch diff --git a/wan/utils/language_utils.py b/wan/utils/language_utils.py new file mode 100755 index 0000000..335f4a0 --- /dev/null +++ b/wan/utils/language_utils.py @@ -0,0 +1,724 @@ +#!/usr/bin/env python3 +""" +Language detection and localization utilities for StableAvatar. +Provides browser language detection and interface localization. +""" + +import re +from typing import Dict, Any, Optional, List + + +def detect_browser_language(accept_language_header: str) -> str: + """ + Detect the preferred language from a browser Accept-Language header. + + Args: + accept_language_header: The Accept-Language header from the browser + + Returns: + str: Detected language code ('zh' for Chinese, 'en' for English, 'es' for Spanish, 'de' for German, 'ja' for Japanese, 'fr' for French, 'pt' for Portuguese, 'ru' for Russian; defaults to 'zh') + """ + if not accept_language_header: + return "zh" # Default to Chinese + + # Parse Accept-Language header (e.g., "zh-CN,zh;q=0.9,en;q=0.8") + languages = [] + for lang_part in accept_language_header.split(','): + lang_part = lang_part.strip() + if ';' in lang_part: + lang, quality = lang_part.split(';', 1) + quality = float(quality.split('=')[1]) if 'q=' in quality else 1.0 + else: + lang = lang_part + quality = 1.0 + + # Extract the base language code (e.g., "zh-CN" -> "zh", "en-US" -> "en") + lang_code = lang.split('-')[0].lower() + languages.append((lang_code, quality)) + + # Sort by quality (higher first) + languages.sort(key=lambda x: x[1], reverse=True) + + # Return the first supported language; regional variants were already + # collapsed to their base code above + supported_languages = {'zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', 'ru'} + for lang_code, _ in languages: + if lang_code in supported_languages: + return lang_code + + # Default to Chinese if no match + return "zh" + + +def get_language_from_request(request) -> str: + """ + Extract language preference from a Gradio request object. 
+ + Args: + request: Gradio request object + + Returns: + str: Detected language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru') + """ + try: + # Try to get Accept-Language header + accept_language = request.headers.get('Accept-Language', '') + return detect_browser_language(accept_language) + except Exception: + return "zh" # Default fallback + + +def get_interface_texts(language: str) -> Dict[str, Dict[str, str]]: + """ + Get interface texts for the specified language. + + Args: + language: Language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru') + + Returns: + Dict containing all interface texts + """ + if language == "en": + return { + "main": { + "title": "StableAvatar", + "device_info": "Running on: {device_summary} | Device: {device} | Data Type: {dtype}", + "language_label": "Language", + "model_settings": "Model Settings", + "video_generation": "Video Generation", + "audio_extraction": "Audio Extraction", + "vocal_separation": "Vocal Separation" + }, + "model_settings": { + "gpu_memory_mode": "GPU Memory Mode", + "gpu_memory_info": "Normal uses 25G VRAM, model_cpu_offload uses 13G VRAM", + "teacache_threshold": "TeaCache Threshold", + "teacache_info": "Recommended 0.1, 0 disables TeaCache acceleration", + "num_skip_start_steps": "Skip Start Steps", + "skip_steps_info": "Recommended 5", + "clip_sample_n_frames": "Clip Sample Frames", + "clip_frames_info": "Video frames, 81=2s@25fps, 161=4s@25fps, must be 4n+1", + "model_selection": "Transformer Model", + "model_selection_info": "Choose the transformer model type: Square (standard) or Rec-Vec (recommended)" + }, + "video_generation": { + "upload_image": "Upload Image", + "upload_audio": "Upload Audio", + "prompt": "Prompt", + "negative_prompt": "Negative Prompt", + "negative_prompt_default": "vivid colors, overexposed, static, blurry details, subtitles, style, artwork, painting, still image, overall gray, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, static image, cluttered background, three legs, many people in background, walking backwards", + "start_generation": "🎬 Start Generation", + "width": "Width", + "height": "Height", + "swap_dimensions": "🔄 Swap Width/Height", + "adjust_size": "Adjust Size Based on Image", + "guidance_scale": "Guidance Scale", + "sampling_steps": "Sampling Steps (Recommended 50)", + "text_guide_scale": "Text Guidance Scale", + "audio_guide_scale": "Audio Guidance Scale", + "motion_frame": "Motion Frame", + "fps": "FPS", + "overlap_window_length": "Overlap Window Length", + "seed": "Seed (positive integer, -1 for random)", + 
"status": "Status", + "generated_result": "Generated Result", + "seed_output": "Seed" + }, + "audio_extraction": { + "upload_video": "Upload Video", + "start_extraction": "🎬 Start Extraction", + "status": "Status", + "generated_result": "Generated Result" + }, + "vocal_separation": { + "upload_audio": "Upload Audio", + "start_separation": "🎬 Start Separation", + "status": "Status", + "generated_result": "Generated Result" + } + } + elif language == "es": # Spanish + return { + "main": { + "title": "StableAvatar", + "device_info": "Ejecutando en: {device_summary} | Dispositivo: {device} | Tipo de datos: {dtype}", + "language_label": "Idioma", + "model_settings": "Configuración del Modelo", + "video_generation": "Generación de Video", + "audio_extraction": "Extracción de Audio", + "vocal_separation": "Separación Vocal" + }, + "model_settings": { + "gpu_memory_mode": "Modo de Memoria GPU", + "gpu_memory_info": "Normal usa 25G VRAM, model_cpu_offload usa 13G VRAM", + "teacache_threshold": "Umbral TeaCache", + "teacache_info": "Recomendado 0.1, 0 desactiva la aceleración TeaCache", + "num_skip_start_steps": "Omitir Pasos Iniciales", + "skip_steps_info": "Recomendado 5", + "clip_sample_n_frames": "Frames de Muestra Clip", + "clip_frames_info": "Frames de video, 81=2s@25fps, 161=4s@25fps, debe ser 4n+1", + "model_selection": "Modelo Transformer", + "model_selection_info": "Elige el tipo de modelo transformer: Square (estándar) o Rec-Vec (recomendado)" + }, + "video_generation": { + "upload_image": "Subir Imagen", + "upload_audio": "Subir Audio", + "prompt": "Prompt", + "negative_prompt": "Prompt Negativo", + "negative_prompt_default": "colores vivos, sobreexpuesto, estático, detalles borrosos, subtítulos, estilo, obra de arte, pintura, imagen fija, gris general, peor calidad, baja calidad, artefactos de compresión JPEG, feo, incompleto, dedos extra, manos mal dibujadas, cara mal dibujada, deforme, desfigurado, extremidades malformadas, dedos fusionados, imagen estática, fondo desordenado, tres piernas, mucha gente en el fondo, caminando hacia atrás", + "start_generation": "🎬 Iniciar Generación", + "width": "Ancho", + "height": "Alto", + "swap_dimensions": "🔄 Intercambiar Ancho/Alto", + "adjust_size": "Ajustar Tamaño Basado en Imagen", + "guidance_scale": "Escala de Guía", + "sampling_steps": "Pasos de Muestreo (Recomendado 50)", + "text_guide_scale": "Escala de Guía de Texto", + "audio_guide_scale": "Escala de Guía de Audio", + "motion_frame": "Frame de Movimiento", + "fps": "FPS", + "overlap_window_length": "Longitud de Ventana de Solapamiento", + "seed": "Semilla (entero positivo, -1 para aleatorio)", + "status": "Estado", + "generated_result": "Resultado Generado", + "seed_output": "Semilla" + }, + "audio_extraction": { + "upload_video": "Subir Video", + "start_extraction": "🎬 Iniciar Extracción", + "status": "Estado", + "generated_result": "Resultado Generado" + }, + "vocal_separation": { + "upload_audio": "Subir Audio", + "start_separation": "🎬 Iniciar Separación", + "status": "Estado", + "generated_result": "Resultado Generado" + } + } + elif language == "de": # German + return { + "main": { + "title": "StableAvatar", + "device_info": "Läuft auf: {device_summary} | Gerät: {device} | Datentyp: {dtype}", + "language_label": "Sprache", + "model_settings": "Modelleinstellungen", + "video_generation": "Video-Generierung", + "audio_extraction": "Audio-Extraktion", + "vocal_separation": "Gesangstrennung" + }, + "model_settings": { + "gpu_memory_mode": "GPU-Speichermodus", + "gpu_memory_info": 
"Normal verwendet 25G VRAM, model_cpu_offload verwendet 13G VRAM", + "teacache_threshold": "TeaCache-Schwellenwert", + "teacache_info": "Empfohlen 0.1, 0 deaktiviert TeaCache-Beschleunigung", + "num_skip_start_steps": "Startschritte Überspringen", + "skip_steps_info": "Empfohlen 5", + "clip_sample_n_frames": "Clip-Sample-Frames", + "clip_frames_info": "Video-Frames, 81=2s@25fps, 161=4s@25fps, muss 4n+1 sein", + "model_selection": "Transformer-Modell", + "model_selection_info": "Wählen Sie den Transformer-Modelltyp: Square (Standard) oder Rec-Vec (empfohlen)" + }, + "video_generation": { + "upload_image": "Bild Hochladen", + "upload_audio": "Audio Hochladen", + "prompt": "Prompt", + "negative_prompt": "Negativer Prompt", + "negative_prompt_default": "lebendige Farben, überbelichtet, statisch, unscharfe Details, Untertitel, Stil, Kunstwerk, Gemälde, Standbild, insgesamt grau, schlechteste Qualität, niedrige Qualität, JPEG-Komprimierungsartefakte, hässlich, unvollständig, zusätzliche Finger, schlecht gezeichnete Hände, schlecht gezeichnetes Gesicht, deformiert, entstellt, missgebildete Gliedmaßen, verschmolzene Finger, statisches Bild, unordentlicher Hintergrund, drei Beine, viele Menschen im Hintergrund, rückwärts gehend", + "start_generation": "🎬 Generierung Starten", + "width": "Breite", + "height": "Höhe", + "swap_dimensions": "🔄 Breite/Höhe Tauschen", + "adjust_size": "Größe Basierend auf Bild Anpassen", + "guidance_scale": "Führungsskala", + "sampling_steps": "Sampling-Schritte (Empfohlen 50)", + "text_guide_scale": "Text-Führungsskala", + "audio_guide_scale": "Audio-Führungsskala", + "motion_frame": "Bewegungsframe", + "fps": "FPS", + "overlap_window_length": "Überlappungsfenster-Länge", + "seed": "Seed (positive Ganzzahl, -1 für zufällig)", + "status": "Status", + "generated_result": "Generiertes Ergebnis", + "seed_output": "Seed" + }, + "audio_extraction": { + "upload_video": "Video Hochladen", + "start_extraction": "🎬 Extraktion Starten", + "status": "Status", + "generated_result": "Generiertes Ergebnis" + }, + "vocal_separation": { + "upload_audio": "Audio Hochladen", + "start_separation": "🎬 Trennung Starten", + "status": "Status", + "generated_result": "Generiertes Ergebnis" + } + } + elif language == "ja": # Japanese + return { + "main": { + "title": "StableAvatar", + "device_info": "実行環境: {device_summary} | デバイス: {device} | データ型: {dtype}", + "language_label": "言語", + "model_settings": "モデル設定", + "video_generation": "動画生成", + "audio_extraction": "音声抽出", + "vocal_separation": "ボーカル分離" + }, + "model_settings": { + "gpu_memory_mode": "GPUメモリモード", + "gpu_memory_info": "Normalは25G VRAM、model_cpu_offloadは13G VRAMを使用", + "teacache_threshold": "TeaCache閾値", + "teacache_info": "推奨値0.1、0でTeaCache加速を無効化", + "num_skip_start_steps": "開始ステップをスキップ", + "skip_steps_info": "推奨値5", + "clip_sample_n_frames": "Clipサンプルフレーム", + "clip_frames_info": "動画フレーム、81=2秒@25fps、161=4秒@25fps、4n+1である必要があります", + "model_selection": "Transformerモデル", + "model_selection_info": "Transformerモデルタイプを選択: Square(標準)またはRec-Vec(推奨)" + }, + "video_generation": { + "upload_image": "画像をアップロード", + "upload_audio": "音声をアップロード", + "prompt": "プロンプト", + "negative_prompt": "ネガティブプロンプト", + "negative_prompt_default": "鮮やかな色、露出オーバー、静止、ぼやけた詳細、字幕、スタイル、アートワーク、絵画、静止画像、全体的にグレー、最悪の品質、低品質、JPEG圧縮アーティファクト、醜い、不完全、余分な指、不適切に描かれた手、不適切に描かれた顔、変形、破損、奇形の手足、融合した指、静止画像、乱雑な背景、3本足、背景に多くの人、後ろ向きに歩く", + "start_generation": "🎬 生成開始", + "width": "幅", + "height": "高さ", + "swap_dimensions": "🔄 幅/高さを交換", + "adjust_size": "画像に基づいてサイズを調整", + "guidance_scale": 
"ガイダンススケール", + "sampling_steps": "サンプリングステップ(推奨50)", + "text_guide_scale": "テキストガイダンススケール", + "audio_guide_scale": "音声ガイダンススケール", + "motion_frame": "モーションフレーム", + "fps": "FPS", + "overlap_window_length": "オーバーラップウィンドウ長", + "seed": "シード(正の整数、-1でランダム)", + "status": "ステータス", + "generated_result": "生成結果", + "seed_output": "シード" + }, + "audio_extraction": { + "upload_video": "動画をアップロード", + "start_extraction": "🎬 抽出開始", + "status": "ステータス", + "generated_result": "生成結果" + }, + "vocal_separation": { + "upload_audio": "音声をアップロード", + "start_separation": "🎬 分離開始", + "status": "ステータス", + "generated_result": "生成結果" + } + } + elif language == "fr": # French + return { + "main": { + "title": "StableAvatar", + "device_info": "Exécution sur: {device_summary} | Appareil: {device} | Type de données: {dtype}", + "language_label": "Langue", + "model_settings": "Paramètres du Modèle", + "video_generation": "Génération Vidéo", + "audio_extraction": "Extraction Audio", + "vocal_separation": "Séparation Vocale" + }, + "model_settings": { + "gpu_memory_mode": "Mode Mémoire GPU", + "gpu_memory_info": "Normal utilise 25G VRAM, model_cpu_offload utilise 13G VRAM", + "teacache_threshold": "Seuil TeaCache", + "teacache_info": "Recommandé 0.1, 0 désactive l'accélération TeaCache", + "num_skip_start_steps": "Ignorer les Étapes Initiales", + "skip_steps_info": "Recommandé 5", + "clip_sample_n_frames": "Images d'Échantillon Clip", + "clip_frames_info": "Images vidéo, 81=2s@25fps, 161=4s@25fps, doit être 4n+1", + "model_selection": "Modèle Transformer", + "model_selection_info": "Choisissez le type de modèle transformer: Square (standard) ou Rec-Vec (recommandé)" + }, + "video_generation": { + "upload_image": "Télécharger Image", + "upload_audio": "Télécharger Audio", + "prompt": "Prompt", + "negative_prompt": "Prompt Négatif", + "negative_prompt_default": "couleurs vives, surexposé, statique, détails flous, sous-titres, style, œuvre d'art, peinture, image fixe, gris général, pire qualité, basse qualité, artefacts de compression JPEG, laid, incomplet, doigts supplémentaires, mains mal dessinées, visage mal dessiné, déformé, défiguré, membres malformés, doigts fusionnés, image statique, arrière-plan encombré, trois jambes, beaucoup de gens en arrière-plan, marchant à reculons", + "start_generation": "🎬 Démarrer Génération", + "width": "Largeur", + "height": "Hauteur", + "swap_dimensions": "🔄 Échanger Largeur/Hauteur", + "adjust_size": "Ajuster Taille Basée sur Image", + "guidance_scale": "Échelle de Guidage", + "sampling_steps": "Étapes d'Échantillonnage (Recommandé 50)", + "text_guide_scale": "Échelle de Guidage Texte", + "audio_guide_scale": "Échelle de Guidage Audio", + "motion_frame": "Image de Mouvement", + "fps": "FPS", + "overlap_window_length": "Longueur Fenêtre de Chevauchement", + "seed": "Graine (entier positif, -1 pour aléatoire)", + "status": "Statut", + "generated_result": "Résultat Généré", + "seed_output": "Graine" + }, + "audio_extraction": { + "upload_video": "Télécharger Vidéo", + "start_extraction": "🎬 Démarrer Extraction", + "status": "Statut", + "generated_result": "Résultat Généré" + }, + "vocal_separation": { + "upload_audio": "Télécharger Audio", + "start_separation": "🎬 Démarrer Séparation", + "status": "Statut", + "generated_result": "Résultat Généré" + } + } + elif language == "pt": # Portuguese + return { + "main": { + "title": "StableAvatar", + "device_info": "Executando em: {device_summary} | Dispositivo: {device} | Tipo de dados: {dtype}", + "language_label": "Idioma", + "model_settings": 
"Configurações do Modelo", + "video_generation": "Geração de Vídeo", + "audio_extraction": "Extração de Áudio", + "vocal_separation": "Separação Vocal" + }, + "model_settings": { + "gpu_memory_mode": "Modo de Memória GPU", + "gpu_memory_info": "Normal usa 25G VRAM, model_cpu_offload usa 13G VRAM", + "teacache_threshold": "Limite TeaCache", + "teacache_info": "Recomendado 0.1, 0 desativa aceleração TeaCache", + "num_skip_start_steps": "Pular Passos Iniciais", + "skip_steps_info": "Recomendado 5", + "clip_sample_n_frames": "Quadros de Amostra Clip", + "clip_frames_info": "Quadros de vídeo, 81=2s@25fps, 161=4s@25fps, deve ser 4n+1", + "model_selection": "Modelo Transformer", + "model_selection_info": "Escolha o tipo de modelo transformer: Square (padrão) ou Rec-Vec (recomendado)" + }, + "video_generation": { + "upload_image": "Carregar Imagem", + "upload_audio": "Carregar Áudio", + "prompt": "Prompt", + "negative_prompt": "Prompt Negativo", + "negative_prompt_default": "cores vivas, superexposto, estático, detalhes borrados, legendas, estilo, obra de arte, pintura, imagem fixa, cinza geral, pior qualidade, baixa qualidade, artefatos de compressão JPEG, feio, incompleto, dedos extras, mãos mal desenhadas, rosto mal desenhado, deformado, desfigurado, membros malformados, dedos fundidos, imagem estática, fundo desordenado, três pernas, muitas pessoas no fundo, andando para trás", + "start_generation": "🎬 Iniciar Geração", + "width": "Largura", + "height": "Altura", + "swap_dimensions": "🔄 Trocar Largura/Altura", + "adjust_size": "Ajustar Tamanho Baseado na Imagem", + "guidance_scale": "Escala de Orientação", + "sampling_steps": "Passos de Amostragem (Recomendado 50)", + "text_guide_scale": "Escala de Orientação de Texto", + "audio_guide_scale": "Escala de Orientação de Áudio", + "motion_frame": "Quadro de Movimento", + "fps": "FPS", + "overlap_window_length": "Comprimento da Janela de Sobreposição", + "seed": "Semente (inteiro positivo, -1 para aleatório)", + "status": "Status", + "generated_result": "Resultado Gerado", + "seed_output": "Semente" + }, + "audio_extraction": { + "upload_video": "Carregar Vídeo", + "start_extraction": "🎬 Iniciar Extração", + "status": "Status", + "generated_result": "Resultado Gerado" + }, + "vocal_separation": { + "upload_audio": "Carregar Áudio", + "start_separation": "🎬 Iniciar Separação", + "status": "Status", + "generated_result": "Resultado Gerado" + } + } + elif language == "ru": # Russian + return { + "main": { + "title": "StableAvatar", + "device_info": "Запуск на: {device_summary} | Устройство: {device} | Тип данных: {dtype}", + "language_label": "Язык", + "model_settings": "Настройки Модели", + "video_generation": "Генерация Видео", + "audio_extraction": "Извлечение Аудио", + "vocal_separation": "Разделение Вокала" + }, + "model_settings": { + "gpu_memory_mode": "Режим Памяти GPU", + "gpu_memory_info": "Normal использует 25G VRAM, model_cpu_offload использует 13G VRAM", + "teacache_threshold": "Порог TeaCache", + "teacache_info": "Рекомендуется 0.1, 0 отключает ускорение TeaCache", + "num_skip_start_steps": "Пропустить Начальные Шаги", + "skip_steps_info": "Рекомендуется 5", + "clip_sample_n_frames": "Кадры Образца Clip", + "clip_frames_info": "Видеокадры, 81=2с@25fps, 161=4с@25fps, должно быть 4n+1", + "model_selection": "Модель Transformer", + "model_selection_info": "Выберите тип модели transformer: Square (стандартная) или Rec-Vec (рекомендуемая)" + }, + "video_generation": { + "upload_image": "Загрузить Изображение", + "upload_audio": "Загрузить 
Аудио", + "prompt": "Промпт", + "negative_prompt": "Негативный Промпт", + "negative_prompt_default": "яркие цвета, переэкспонированный, статичный, размытые детали, субтитры, стиль, произведение искусства, живопись, неподвижное изображение, общий серый, худшее качество, низкое качество, артефакты сжатия JPEG, уродливый, неполный, лишние пальцы, плохо нарисованные руки, плохо нарисованное лицо, деформированный, обезображенный, неправильно сформированные конечности, сросшиеся пальцы, статичное изображение, загроможденный фон, три ноги, много людей на фоне, идущий назад", + "start_generation": "🎬 Начать Генерацию", + "width": "Ширина", + "height": "Высота", + "swap_dimensions": "🔄 Поменять Ширину/Высоту", + "adjust_size": "Настроить Размер на Основе Изображения", + "guidance_scale": "Шкала Направления", + "sampling_steps": "Шаги Семплирования (Рекомендуется 50)", + "text_guide_scale": "Шкала Направления Текста", + "audio_guide_scale": "Шкала Направления Аудио", + "motion_frame": "Кадр Движения", + "fps": "FPS", + "overlap_window_length": "Длина Окна Перекрытия", + "seed": "Семя (положительное целое, -1 для случайного)", + "status": "Статус", + "generated_result": "Сгенерированный Результат", + "seed_output": "Семя" + }, + "audio_extraction": { + "upload_video": "Загрузить Видео", + "start_extraction": "🎬 Начать Извлечение", + "status": "Статус", + "generated_result": "Сгенерированный Результат" + }, + "vocal_separation": { + "upload_audio": "Загрузить Аудио", + "start_separation": "🎬 Начать Разделение", + "status": "Статус", + "generated_result": "Сгенерированный Результат" + } + } + else: # Chinese (zh) + return { + "main": { + "title": "StableAvatar", + "device_info": "运行环境: {device_summary} | 设备: {device} | 数据类型: {dtype}", + "language_label": "语言", + "model_settings": "模型设置", + "video_generation": "视频生成", + "audio_extraction": "音频提取", + "vocal_separation": "人声分离" + }, + "model_settings": { + "gpu_memory_mode": "显存模式", + "gpu_memory_info": "Normal占用25G显存,model_cpu_offload占用13G显存", + "teacache_threshold": "teacache threshold", + "teacache_info": "推荐参数0.1,0为禁用teacache加速", + "num_skip_start_steps": "跳过开始步数", + "skip_steps_info": "推荐参数5", + "clip_sample_n_frames": "Clip采样帧数", + "clip_frames_info": "视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1", + "model_selection": "Transformer模型", + "model_selection_info": "选择transformer模型类型:Square(标准)或Rec-Vec(推荐)" + }, + "video_generation": { + "upload_image": "上传图片", + "upload_audio": "上传音频", + "prompt": "提示词", + "negative_prompt": "负面提示词", + "negative_prompt_default": "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + "start_generation": "🎬 开始生成", + "width": "宽度", + "height": "高度", + "swap_dimensions": "🔄 交换宽高", + "adjust_size": "根据图片调整宽高", + "guidance_scale": "guidance scale", + "sampling_steps": "采样步数(推荐50步)", + "text_guide_scale": "text guidance scale", + "audio_guide_scale": "audio guidance scale", + "motion_frame": "motion frame", + "fps": "帧率", + "overlap_window_length": "overlap window length", + "seed": "种子,请输入正整数,-1为随机", + "status": "提示信息", + "generated_result": "生成结果", + "seed_output": "种子" + }, + "audio_extraction": { + "upload_video": "上传视频", + "start_extraction": "🎬 开始提取", + "status": "提示信息", + "generated_result": "生成结果" + }, + "vocal_separation": { + "upload_audio": "上传音频", + "start_separation": "🎬 开始分离", + "status": "提示信息", + "generated_result": "生成结果" + } + } + + +def get_display_language(language_code: str) -> str: + """ + Convert language code to display 
+
+
+def get_display_language(language_code: str) -> str:
+    """
+    Convert a language code to its display name.
+
+    Args:
+        language_code: Language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru')
+
+    Returns:
+        str: Display name ('中文', 'English', 'Español', 'Deutsch', '日本語', 'Français', 'Português', or 'Русский')
+    """
+    language_map = {
+        "zh": "中文",
+        "en": "English",
+        "es": "Español",
+        "de": "Deutsch",
+        "ja": "日本語",
+        "fr": "Français",
+        "pt": "Português",
+        "ru": "Русский"
+    }
+    return language_map.get(language_code, "中文")
+
+
+def get_language_choices() -> List[Tuple[str, str]]:
+    """
+    Get language choices for the Gradio Radio component.
+
+    Returns:
+        List of (display_name, language_code) tuples
+    """
+    return [
+        ("中文", "zh"),
+        ("English", "en"),
+        ("Español", "es"),
+        ("Deutsch", "de"),
+        ("日本語", "ja"),
+        ("Français", "fr"),
+        ("Português", "pt"),
+        ("Русский", "ru")
+    ]
+
+
+def create_language_detection_js() -> str:
+    """
+    Create simple JavaScript code for client-side language detection.
+    This version only sets the radio button without triggering events to avoid conflicts.
+
+    Returns:
+        str: JavaScript code for language detection
+    """
+    # Minimal sketch: reduce navigator.language to a supported code and check
+    # the matching radio option once the page has loaded. The selector below
+    # is an assumption about how Gradio renders the Radio component; adjust
+    # it if the markup differs. No change event is dispatched, so no Gradio
+    # callbacks fire.
+    return """
+    <script>
+    (function () {
+        const supported = ["zh", "en", "es", "de", "ja", "fr", "pt", "ru"];
+        const lang = (navigator.language || "zh").split("-")[0].toLowerCase();
+        const code = supported.includes(lang) ? lang : "zh";
+        window.addEventListener("load", function () {
+            document.querySelectorAll('input[type="radio"]').forEach(function (radio) {
+                if (radio.value === code) {
+                    radio.checked = true;
+                }
+            });
+        });
+    })();
+    </script>
+    """
+
+
+if __name__ == "__main__":
+    # Test the language detection
+    test_headers = [
+        "zh-CN,zh;q=0.9,en;q=0.8",
+        "en-US,en;q=0.9",
+        "es-ES,es;q=0.9,en;q=0.8",
+        "de-DE,de;q=0.9,en;q=0.8",
+        "ja-JP,ja;q=0.9,en;q=0.8",
+        "fr-FR,fr;q=0.9,en;q=0.8",
+        "pt-BR,pt;q=0.9,en;q=0.8",
+        "ru-RU,ru;q=0.9,en;q=0.8",
+        "zh-TW,zh;q=0.9"
+    ]
+
+    print("Testing language detection:")
+    for header in test_headers:
+        detected = detect_browser_language(header)
+        display_name = get_display_language(detected)
+        print(f"Header: {header} -> Detected: {detected} ({display_name})")
+
+    print("\nTesting interface texts:")
+    for lang in ["zh", "en", "es", "de", "ja", "fr", "pt", "ru"]:
+        texts = get_interface_texts(lang)
+        display_name = get_display_language(lang)
+        print(f"\n{display_name} ({lang.upper()}):")
+        print(f"  Title: {texts['main']['title']}")
+        print(f"  Language Label: {texts['main']['language_label']}")
+        print(f"  Start Generation: {texts['video_generation']['start_generation']}")
+        print(f"  Upload Image: {texts['video_generation']['upload_image']}")
+
+    print("\nTesting language choices:")
+    choices = get_language_choices()
+    for display_name, lang_code in choices:
+        print(f"  {display_name} -> {lang_code}")
diff --git a/wan/utils/lora_utils.py b/wan/utils/lora_utils.py
old mode 100644
new mode 100755
index 9748364..7a2f698
--- a/wan/utils/lora_utils.py
+++ b/wan/utils/lora_utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import hashlib
 import math
 import os
diff --git a/wan/utils/prompt_extend.py b/wan/utils/prompt_extend.py
old mode 100644
new mode 100755
index f3981a8..cf81f75
--- a/wan/utils/prompt_extend.py
+++ b/wan/utils/prompt_extend.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import json
 import math
diff --git a/wan/utils/qwen_vl_utils.py b/wan/utils/qwen_vl_utils.py
old mode 100644
new mode 100755
index 3c682e6..fe6619e
--- a/wan/utils/qwen_vl_utils.py
+++ b/wan/utils/qwen_vl_utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copied from https://github.com/kq-chen/qwen-vl-utils
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 from __future__ import annotations
diff --git a/wan/utils/utils.py b/wan/utils/utils.py
old mode 100644
new mode 100755
index 4cb9052..97c68ac
--- a/wan/utils/utils.py
+++ b/wan/utils/utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import argparse
 import binascii

From 7defa77396ed2a1cfaea3cbfdfe2c8d0e2ea6014 Mon Sep 17 00:00:00 2001
From: jaminmc <1310376+jaminmc@users.noreply.github.com>
Date: Sun, 14 Sep 2025 22:57:05 -0400
Subject: [PATCH 2/2] Update .gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index a7797d5..20e99aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@ checkpoints
 
 # Generated outputs
 outputs/
-requirements.txt
 
 # Virtual environments
 venv/