From 479c2a6ad5735138130abc40c2fb283372992817 Mon Sep 17 00:00:00 2001 From: jaminmc <1310376+jaminmc@users.noreply.github.com> Date: Sun, 14 Sep 2025 20:50:12 -0400 Subject: [PATCH 1/2] Add multi-language interface support and gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add comprehensive multi-language support (Chinese, English, Spanish, German, Japanese, French, Portuguese, Russian) - Add language detection and localization utilities in wan/utils/language_utils.py - Update gradio interface with dynamic language switching - Add standard Python .gitignore file - Replace hardcoded 'path/StableAvatar' with dynamic SCRIPT_DIR variable - Use $(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) to get script location - Update all path references in inference.sh and multiple_gpu_inference.sh - Makes scripts portable and work regardless of installation location Optimize requirements.txt installation order for efficiency - Reorganize packages by dependency hierarchy and installation efficiency - Group packages logically: build tools → PyTorch → computer vision → ML frameworks → audio/video → utilities - Install large dependencies (PyTorch ecosystem) first to avoid conflicts - Install core numerical libraries (numpy) early as many packages depend on them - Add clear section comments explaining the installation strategy - Remove tokenizers (automatically installed as dependency of transformers) - Remove imageio-ffmpeg (automatically installed as dependency of imageio[ffmpeg]) - Streamline requirements.txt by removing packages that are automatically installed - Add documentation for optional packages that enhance performance/features Add comprehensive optional packages section to README - Add new 'Optional Packages for Enhanced Performance' section - Document all optional packages: flash-attn, xformers, bitsandbytes, audio-separator[gpu], decord - Include clear descriptions of what each package provides - Add note about automatic fallbacks when packages are not installed - Update existing flash_attn references for consistency - Help users understand which optional packages to install for their use case --- .gitignore | 131 ++++ README.md | 44 +- app.py | 240 +++--- audio_extractor.py | 1 + inference.py | 1 + inference.sh | 16 +- lip_mask_extractor.py | 1 + multiple_gpu_inference.sh | 16 +- requirements.txt | 83 +- train_14B.py | 1 + train_14B.sh | 1 + train_14B_lora.py | 1 + train_14B_lora.sh | 1 + train_1B_rec_vec.py | 1 + train_1B_rec_vec.sh | 1 + train_1B_rec_vec_64.sh | 1 + train_1B_rec_vec_lora.py | 1 + train_1B_rec_vec_lora.sh | 1 + train_1B_rec_vec_lora_64.sh | 1 + train_1B_square.py | 1 + train_1B_square.sh | 1 + train_1B_square_64.sh | 1 + vocal_seperator.py | 1 + wan/__init__.py | 1 + wan/configs/__init__.py | 1 + wan/configs/shared_config.py | 1 + wan/configs/wan_i2v_14B.py | 1 + wan/configs/wan_t2v_14B.py | 1 + wan/configs/wan_t2v_1_3B.py | 1 + wan/dataset/talking_video_dataset_fantasy.py | 1 + wan/dist/__init__.py | 1 + wan/dist/wan_xfuser.py | 1 + wan/distributed/__init__.py | 0 wan/distributed/fsdp.py | 1 + wan/distributed/xdit_context_parallel.py | 1 + wan/image2video.py | 1 + wan/models/__init__.py | 0 wan/models/attention_processor.py | 1 + wan/models/cache_utils.py | 1 + wan/models/motion_controller.py | 1 + wan/models/motion_to_bucket.py | 1 + wan/models/vocal_projector_fantasy.py | 1 + wan/models/vocal_projector_fantasy_14B.py | 1 + wan/models/vocal_projector_fantasy_1B.py | 1 + wan/models/wan_fantasy_transformer3d_14B.py | 1 + 
wan/models/wan_fantasy_transformer3d_1B.py | 1 + wan/models/wan_image_encoder.py | 1 + wan/models/wan_text_encoder.py | 1 + wan/models/wan_transformer3d.py | 1 + wan/models/wan_vae.py | 1 + wan/models/wan_xlm_roberta.py | 1 + wan/models/wav2vec.py | 1 + wan/pipeline/__init__.py | 0 wan/pipeline/pipeline_wan_fun_inpaint.py | 1 + wan/pipeline/wan_inference_long_pipeline.py | 1 + .../wan_inference_pipeline_fantasy.py | 1 + wan/text2video.py | 1 + wan/utils/__init__.py | 1 + wan/utils/color_correction.py | 1 + wan/utils/discrete_sampler.py | 1 + wan/utils/fm_solvers.py | 1 + wan/utils/fm_solvers_unipc.py | 1 + wan/utils/fp8_optimization.py | 1 + wan/utils/language_utils.py | 724 ++++++++++++++++++ wan/utils/lora_utils.py | 1 + wan/utils/prompt_extend.py | 1 + wan/utils/qwen_vl_utils.py | 1 + wan/utils/utils.py | 1 + 68 files changed, 1151 insertions(+), 161 deletions(-) create mode 100644 .gitignore mode change 100644 => 100755 app.py mode change 100644 => 100755 audio_extractor.py mode change 100644 => 100755 inference.py mode change 100644 => 100755 inference.sh mode change 100644 => 100755 lip_mask_extractor.py mode change 100644 => 100755 multiple_gpu_inference.sh mode change 100644 => 100755 train_14B.py mode change 100644 => 100755 train_14B.sh mode change 100644 => 100755 train_14B_lora.py mode change 100644 => 100755 train_14B_lora.sh mode change 100644 => 100755 train_1B_rec_vec.py mode change 100644 => 100755 train_1B_rec_vec.sh mode change 100644 => 100755 train_1B_rec_vec_64.sh mode change 100644 => 100755 train_1B_rec_vec_lora.py mode change 100644 => 100755 train_1B_rec_vec_lora.sh mode change 100644 => 100755 train_1B_rec_vec_lora_64.sh mode change 100644 => 100755 train_1B_square.py mode change 100644 => 100755 train_1B_square.sh mode change 100644 => 100755 train_1B_square_64.sh mode change 100644 => 100755 vocal_seperator.py mode change 100644 => 100755 wan/__init__.py mode change 100644 => 100755 wan/configs/__init__.py mode change 100644 => 100755 wan/configs/shared_config.py mode change 100644 => 100755 wan/configs/wan_i2v_14B.py mode change 100644 => 100755 wan/configs/wan_t2v_14B.py mode change 100644 => 100755 wan/configs/wan_t2v_1_3B.py mode change 100644 => 100755 wan/dataset/talking_video_dataset_fantasy.py mode change 100644 => 100755 wan/dist/__init__.py mode change 100644 => 100755 wan/dist/wan_xfuser.py mode change 100644 => 100755 wan/distributed/__init__.py mode change 100644 => 100755 wan/distributed/fsdp.py mode change 100644 => 100755 wan/distributed/xdit_context_parallel.py mode change 100644 => 100755 wan/image2video.py mode change 100644 => 100755 wan/models/__init__.py mode change 100644 => 100755 wan/models/attention_processor.py mode change 100644 => 100755 wan/models/cache_utils.py mode change 100644 => 100755 wan/models/motion_controller.py mode change 100644 => 100755 wan/models/motion_to_bucket.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy_14B.py mode change 100644 => 100755 wan/models/vocal_projector_fantasy_1B.py mode change 100644 => 100755 wan/models/wan_fantasy_transformer3d_14B.py mode change 100644 => 100755 wan/models/wan_fantasy_transformer3d_1B.py mode change 100644 => 100755 wan/models/wan_image_encoder.py mode change 100644 => 100755 wan/models/wan_text_encoder.py mode change 100644 => 100755 wan/models/wan_transformer3d.py mode change 100644 => 100755 wan/models/wan_vae.py mode change 100644 => 100755 wan/models/wan_xlm_roberta.py mode change 100644 
=> 100755 wan/models/wav2vec.py mode change 100644 => 100755 wan/pipeline/__init__.py mode change 100644 => 100755 wan/pipeline/pipeline_wan_fun_inpaint.py mode change 100644 => 100755 wan/pipeline/wan_inference_long_pipeline.py mode change 100644 => 100755 wan/pipeline/wan_inference_pipeline_fantasy.py mode change 100644 => 100755 wan/text2video.py mode change 100644 => 100755 wan/utils/__init__.py mode change 100644 => 100755 wan/utils/color_correction.py mode change 100644 => 100755 wan/utils/discrete_sampler.py mode change 100644 => 100755 wan/utils/fm_solvers.py mode change 100644 => 100755 wan/utils/fm_solvers_unipc.py mode change 100644 => 100755 wan/utils/fp8_optimization.py create mode 100755 wan/utils/language_utils.py mode change 100644 => 100755 wan/utils/lora_utils.py mode change 100644 => 100755 wan/utils/prompt_extend.py mode change 100644 => 100755 wan/utils/qwen_vl_utils.py mode change 100644 => 100755 wan/utils/utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a7797d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,131 @@ +# Checkpoints and model files +checkpoints/ +checkpoints + +# Generated outputs +outputs/ + +# Virtual environments +venv/ +.venv/ +env/ +.env/ + +# Python cache files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db diff --git a/README.md b/README.md index 0ce50ca..00eeac2 100644 --- a/README.md +++ b/README.md @@ -96,22 +96,50 @@ For the basic version of the model checkpoint (Wan2.1-1.3B-based), it supports g ### 🧱 Environment setup -``` +Choose the appropriate setup based on your hardware: + +#### CUDA 12.4 (RTX 40xx series and earlier) +```bash pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu124 pip install -r requirements.txt -# Optional to install flash_attn to accelerate attention computation -pip install flash_attn +# Optional: install flash-attn for faster attention computation (NVIDIA only) +pip install flash-attn ``` -### 🧱 Environment setup for Blackwell series chips - -``` +#### CUDA 12.8 (Blackwell series chips - RTX 50xx, B200, etc.)
+```bash pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128 pip install -r requirements.txt -# Optional to install flash_attn to accelerate attention computation -pip install flash_attn +# Optional: install flash-attn for faster attention computation (NVIDIA only) +pip install flash-attn +``` + +#### CPU-only (macOS, Linux without GPU, or for testing) +```bash +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +pip install -r requirements.txt +``` + +### 🚀 Optional Packages for Enhanced Performance + +For better performance and additional features, you can install these optional packages: + +```bash +# Memory efficient attention (alternative to flash-attn, works on more hardware) +pip install xformers + +# 8-bit training optimization (for LoRA training) +pip install bitsandbytes + +# Vocal separation functionality +pip install "audio-separator[gpu]" + +# Faster video reading (not available on macOS, falls back to torchvision automatically) +pip install decord ``` +**Note**: All these packages are optional. The system will automatically fall back to standard implementations if they're not installed. Install only the packages you need for your specific use case. + ### 🧱 Download weights If you encounter connection issues with Hugging Face, you can utilize the mirror endpoint by setting the environment variable: `export HF_ENDPOINT=https://hf-mirror.com`. Please download weights manually as follows: diff --git a/app.py b/app.py old mode 100644 new mode 100755 index 7d84bd4..bb2fcc8 --- a/app.py +++ b/app.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import psutil import argparse @@ -15,6 +16,10 @@ from wan.pipeline.wan_inference_long_pipeline import WanI2VTalkingInferenceLongPipeline from wan.utils.fp8_optimization import replace_parameters_by_name, convert_weight_dtype_wrapper, convert_model_weight_to_float8 from wan.utils.utils import get_image_to_video_latent, save_videos_grid +from wan.utils.language_utils import ( + detect_browser_language, get_interface_texts, get_display_language, + get_language_choices, create_language_detection_js +) import numpy as np import librosa import datetime @@ -277,162 +282,165 @@ def vocal_separation(audio_path): return f"outputs/{timestamp}.wav", f"Generated outputs/{timestamp}.wav / 已生成outputs/{timestamp}.wav" +def detect_and_set_language(request): + """Detect browser language and return appropriate language setting.""" + try: + # Get Accept-Language header from request + accept_language = request.headers.get('Accept-Language', '') + detected_lang = detect_browser_language(accept_language) + return get_display_language(detected_lang) + except Exception: + return "中文" # Default fallback + + def update_language(language): - if language == "English": - return { - GPU_memory_mode: gr.Dropdown(label="GPU Memory Mode", info="Normal uses 25G VRAM, model_cpu_offload uses 13G VRAM"), - teacache_threshold: gr.Slider(label="TeaCache Threshold", info="Recommended 0.1, 0 disables TeaCache acceleration"), - num_skip_start_steps: gr.Slider(label="Skip Start Steps", info="Recommended 5"), - clip_sample_n_frames: gr.Slider(label="Clip Sample Frames", info="Video frames, 81=2s@25fps, 161=4s@25fps, must be 4n+1"), - image_path: gr.Image(label="Upload Image"), - audio_path: gr.Audio(label="Upload Audio"), - prompt: gr.Textbox(label="Prompt"), - negative_prompt: gr.Textbox(label="Negative Prompt"), - generate_button: gr.Button("🎬 Start Generation"), - width: 
gr.Slider(label="Width"), - height: gr.Slider(label="Height"), - exchange_button: gr.Button("🔄 Swap Width/Height"), - adjust_button: gr.Button("Adjust Size Based on Image"), - guidance_scale: gr.Slider(label="Guidance Scale"), - num_inference_steps: gr.Slider(label="Sampling Steps (Recommended 50)"), - text_guide_scale: gr.Slider(label="Text Guidance Scale"), - audio_guide_scale: gr.Slider(label="Audio Guidance Scale"), - motion_frame: gr.Slider(label="Motion Frame"), - fps: gr.Slider(label="FPS"), - overlap_window_length: gr.Slider(label="Overlap Window Length"), - seed_param: gr.Number(label="Seed (positive integer, -1 for random)"), - info: gr.Textbox(label="Status"), - video_output: gr.Video(label="Generated Result"), - seed_output: gr.Textbox(label="Seed"), - video_path: gr.Video(label="Upload Video"), - extractor_button: gr.Button("🎬 Start Extraction"), - info2: gr.Textbox(label="Status"), - audio_output: gr.Audio(label="Generated Result"), - audio_path3: gr.Audio(label="Upload Audio"), - separation_button: gr.Button("🎬 Start Separation"), - info3: gr.Textbox(label="Status"), - audio_output3: gr.Audio(label="Generated Result") - } - else: - return { - GPU_memory_mode: gr.Dropdown(label="显存模式", info="Normal占用25G显存,model_cpu_offload占用13G显存"), - teacache_threshold: gr.Slider(label="teacache threshold", info="推荐参数0.1,0为禁用teacache加速"), - num_skip_start_steps: gr.Slider(label="跳过开始步数", info="推荐参数5"), - clip_sample_n_frames: gr.Slider(label="Clip采样帧数", info="视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1"), - image_path: gr.Image(label="上传图片"), - audio_path: gr.Audio(label="上传音频"), - prompt: gr.Textbox(label="提示词"), - negative_prompt: gr.Textbox(label="负面提示词"), - generate_button: gr.Button("🎬 开始生成"), - width: gr.Slider(label="宽度"), - height: gr.Slider(label="高度"), - exchange_button: gr.Button("🔄 交换宽高"), - adjust_button: gr.Button("根据图片调整宽高"), - guidance_scale: gr.Slider(label="guidance scale"), - num_inference_steps: gr.Slider(label="采样步数(推荐50步)"), - text_guide_scale: gr.Slider(label="text guidance scale"), - audio_guide_scale: gr.Slider(label="audio guidance scale"), - motion_frame: gr.Slider(label="motion frame"), - fps: gr.Slider(label="帧率"), - overlap_window_length: gr.Slider(label="overlap window length"), - seed_param: gr.Number(label="种子,请输入正整数,-1为随机"), - info: gr.Textbox(label="提示信息"), - video_output: gr.Video(label="生成结果"), - seed_output: gr.Textbox(label="种子"), - video_path: gr.Video(label="上传视频"), - extractor_button: gr.Button("🎬 开始提取"), - info2: gr.Textbox(label="提示信息"), - audio_output: gr.Audio(label="生成结果"), - audio_path3: gr.Audio(label="上传音频"), - separation_button: gr.Button("🎬 开始分离"), - info3: gr.Textbox(label="提示信息"), - audio_output3: gr.Audio(label="生成结果") - } + """Update interface language based on user selection.""" + # The language parameter is actually the language code (second element of tuple) + # So we can use it directly + lang_code = language # language is already the code like 'es', 'de', 'ja', etc. + texts = get_interface_texts(lang_code) + + # Return component updates in the same order as all_components + return [ + gr.Markdown(f""" +
+        <div>
+            <h1>{texts['main']['title']}</h1>
+        </div>
+ """), + gr.Dropdown(label=texts["model_settings"]["gpu_memory_mode"], info=texts["model_settings"]["gpu_memory_info"]), + gr.Slider(label=texts["model_settings"]["teacache_threshold"], info=texts["model_settings"]["teacache_info"]), + gr.Slider(label=texts["model_settings"]["num_skip_start_steps"], info=texts["model_settings"]["skip_steps_info"]), + gr.Slider(label=texts["model_settings"]["clip_sample_n_frames"], info=texts["model_settings"]["clip_frames_info"]), + gr.Image(label=texts["video_generation"]["upload_image"]), + gr.Audio(label=texts["video_generation"]["upload_audio"]), + gr.Textbox(label=texts["video_generation"]["prompt"]), + gr.Textbox(label=texts["video_generation"]["negative_prompt"], value=texts["video_generation"]["negative_prompt_default"]), + gr.Button(texts["video_generation"]["start_generation"]), + gr.Slider(label=texts["video_generation"]["width"]), + gr.Slider(label=texts["video_generation"]["height"]), + gr.Button(texts["video_generation"]["swap_dimensions"]), + gr.Button(texts["video_generation"]["adjust_size"]), + gr.Slider(label=texts["video_generation"]["guidance_scale"]), + gr.Slider(label=texts["video_generation"]["sampling_steps"]), + gr.Slider(label=texts["video_generation"]["text_guide_scale"]), + gr.Slider(label=texts["video_generation"]["audio_guide_scale"]), + gr.Slider(label=texts["video_generation"]["motion_frame"]), + gr.Slider(label=texts["video_generation"]["fps"]), + gr.Slider(label=texts["video_generation"]["overlap_window_length"]), + gr.Number(label=texts["video_generation"]["seed"]), + gr.Textbox(label=texts["video_generation"]["status"]), + gr.Video(label=texts["video_generation"]["generated_result"]), + gr.Textbox(label=texts["video_generation"]["seed_output"]), + gr.Video(label=texts["audio_extraction"]["upload_video"]), + gr.Button(texts["audio_extraction"]["start_extraction"]), + gr.Textbox(label=texts["audio_extraction"]["status"]), + gr.Audio(label=texts["audio_extraction"]["generated_result"]), + gr.Audio(label=texts["vocal_separation"]["upload_audio"]), + gr.Button(texts["vocal_separation"]["start_separation"]), + gr.Textbox(label=texts["vocal_separation"]["status"]), + gr.Audio(label=texts["vocal_separation"]["generated_result"]) + ] + +# Get initial language texts (default to English) +initial_texts = get_interface_texts("en") with gr.Blocks(theme=gr.themes.Base()) as demo: - gr.Markdown(""" + # Create dynamic device info component that updates with language + device_info_display = gr.Markdown(f"""
-        <div>
-            <h1>StableAvatar</h1>
-        </div>
+        <div>
+            <h1>{initial_texts['main']['title']}</h1>
+        </div>
""") + # Set English as the default language (use language code since Radio uses codes as values) + default_language = "en" + language_radio = gr.Radio( - choices=["English", "中文"], - value="中文", - label="Language / 语言" + choices=get_language_choices(), + value=default_language, + label=initial_texts['main']['language_label'] ) - with gr.Accordion("Model Settings / 模型设置", open=False): + with gr.Accordion(initial_texts['main']['model_settings'], open=False): with gr.Row(): GPU_memory_mode = gr.Dropdown( - label = "显存模式", - info = "Normal占用25G显存,model_cpu_offload占用13G显存", + label = initial_texts['model_settings']['gpu_memory_mode'], + info = initial_texts['model_settings']['gpu_memory_info'], choices = ["Normal", "model_cpu_offload", "model_cpu_offload_and_qfloat8", "sequential_cpu_offload"], value = "Normal" ) - teacache_threshold = gr.Slider(label="teacache threshold", info = "推荐参数0.1,0为禁用teacache加速", minimum=0, maximum=1, step=0.01, value=0) - num_skip_start_steps = gr.Slider(label="跳过开始步数", info = "推荐参数5", minimum=0, maximum=100, step=1, value=5) + teacache_threshold = gr.Slider( + label=initial_texts['model_settings']['teacache_threshold'], + info=initial_texts['model_settings']['teacache_info'], + minimum=0, maximum=1, step=0.01, value=0 + ) + num_skip_start_steps = gr.Slider( + label=initial_texts['model_settings']['num_skip_start_steps'], + info=initial_texts['model_settings']['skip_steps_info'], + minimum=0, maximum=100, step=1, value=5 + ) with gr.Row(): clip_sample_n_frames = gr.Slider( - label="Clip Sample Frames", - info="视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1", + label=initial_texts['model_settings']['clip_sample_n_frames'], + info=initial_texts['model_settings']['clip_frames_info'], minimum=41, maximum=321, step=4, value=81 ) - with gr.TabItem("StableAvatar"): + with gr.TabItem(initial_texts['main']['video_generation']): with gr.Row(): with gr.Column(): with gr.Row(): - image_path = gr.Image(label="上传图片", type="filepath", height=280) - audio_path = gr.Audio(label="上传音频", type="filepath") - prompt = gr.Textbox(label="提示词", value="") - negative_prompt = gr.Textbox(label="负面提示词", value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走") - generate_button = gr.Button("🎬 开始生成", variant='primary') - with gr.Accordion("Parameter Settings / 参数设置", open=True): + image_path = gr.Image(label=initial_texts['video_generation']['upload_image'], type="filepath", height=280) + audio_path = gr.Audio(label=initial_texts['video_generation']['upload_audio'], type="filepath") + prompt = gr.Textbox(label=initial_texts['video_generation']['prompt'], value="") + negative_prompt = gr.Textbox(label=initial_texts['video_generation']['negative_prompt'], value=initial_texts['video_generation']['negative_prompt_default']) + generate_button = gr.Button(initial_texts['video_generation']['start_generation'], variant='primary') + with gr.Accordion(initial_texts['main']['model_settings'], open=True): with gr.Row(): - width = gr.Slider(label="宽度", minimum=256, maximum=2048, step=16, value=512) - height = gr.Slider(label="高度", minimum=256, maximum=2048, step=16, value=512) + width = gr.Slider(label=initial_texts['video_generation']['width'], minimum=256, maximum=2048, step=16, value=512) + height = gr.Slider(label=initial_texts['video_generation']['height'], minimum=256, maximum=2048, step=16, value=512) with gr.Row(): - exchange_button = gr.Button("🔄 交换宽高") - adjust_button = gr.Button("根据图片调整宽高") + exchange_button = 
gr.Button(initial_texts['video_generation']['swap_dimensions']) + adjust_button = gr.Button(initial_texts['video_generation']['adjust_size']) with gr.Row(): - guidance_scale = gr.Slider(label="guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=6.0) - num_inference_steps = gr.Slider(label="采样步数(推荐50步)", minimum=1, maximum=100, step=1, value=10) + guidance_scale = gr.Slider(label=initial_texts['video_generation']['guidance_scale'], minimum=1.0, maximum=10.0, step=0.1, value=6.0) + num_inference_steps = gr.Slider(label=initial_texts['video_generation']['sampling_steps'], minimum=1, maximum=100, step=1, value=10) with gr.Row(): - text_guide_scale = gr.Slider(label="text guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.0) - audio_guide_scale = gr.Slider(label="audio guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=5.0) + text_guide_scale = gr.Slider(label=initial_texts['video_generation']['text_guide_scale'], minimum=1.0, maximum=10.0, step=0.1, value=3.0) + audio_guide_scale = gr.Slider(label=initial_texts['video_generation']['audio_guide_scale'], minimum=1.0, maximum=10.0, step=0.1, value=5.0) with gr.Row(): - motion_frame = gr.Slider(label="motion frame", minimum=1, maximum=50, step=1, value=25) - fps = gr.Slider(label="帧率", minimum=1, maximum=60, step=1, value=25) + motion_frame = gr.Slider(label=initial_texts['video_generation']['motion_frame'], minimum=1, maximum=50, step=1, value=25) + fps = gr.Slider(label=initial_texts['video_generation']['fps'], minimum=1, maximum=60, step=1, value=25) with gr.Row(): - overlap_window_length = gr.Slider(label="overlap window length", minimum=1, maximum=20, step=1, value=5) - seed_param = gr.Number(label="种子,请输入正整数,-1为随机", value=-1) + overlap_window_length = gr.Slider(label=initial_texts['video_generation']['overlap_window_length'], minimum=1, maximum=20, step=1, value=5) + seed_param = gr.Number(label=initial_texts['video_generation']['seed'], value=-1) with gr.Column(): - info = gr.Textbox(label="提示信息", interactive=False) - video_output = gr.Video(label="生成结果", interactive=False) - seed_output = gr.Textbox(label="种子") - with gr.TabItem("Audio Extraction / 音频提取"): + info = gr.Textbox(label=initial_texts['video_generation']['status'], interactive=False) + video_output = gr.Video(label=initial_texts['video_generation']['generated_result'], interactive=False) + seed_output = gr.Textbox(label=initial_texts['video_generation']['seed_output']) + with gr.TabItem(initial_texts['main']['audio_extraction']): with gr.Row(): with gr.Column(): - video_path = gr.Video(label="上传视频", height=500) - extractor_button = gr.Button("🎬 开始提取", variant='primary') + video_path = gr.Video(label=initial_texts['audio_extraction']['upload_video'], height=500) + extractor_button = gr.Button(initial_texts['audio_extraction']['start_extraction'], variant='primary') with gr.Column(): - info2 = gr.Textbox(label="提示信息", interactive=False) - audio_output = gr.Audio(label="生成结果", interactive=False) - with gr.TabItem("Vocal Separation / 人声分离"): + info2 = gr.Textbox(label=initial_texts['audio_extraction']['status'], interactive=False) + audio_output = gr.Audio(label=initial_texts['audio_extraction']['generated_result'], interactive=False) + with gr.TabItem(initial_texts['main']['vocal_separation']): with gr.Row(): with gr.Column(): - audio_path3 = gr.Audio(label="上传音频", type="filepath") - separation_button = gr.Button("🎬 开始分离", variant='primary') + audio_path3 = gr.Audio(label=initial_texts['vocal_separation']['upload_audio'], type="filepath") + 
separation_button = gr.Button(initial_texts['vocal_separation']['start_separation'], variant='primary') with gr.Column(): - info3 = gr.Textbox(label="提示信息", interactive=False) - audio_output3 = gr.Audio(label="生成结果", interactive=False) + info3 = gr.Textbox(label=initial_texts['vocal_separation']['status'], interactive=False) + audio_output3 = gr.Audio(label=initial_texts['vocal_separation']['generated_result'], interactive=False) - all_components = [GPU_memory_mode, teacache_threshold, num_skip_start_steps, clip_sample_n_frames, image_path, audio_path, prompt, negative_prompt, generate_button, width, height, exchange_button, adjust_button, guidance_scale, num_inference_steps, text_guide_scale, audio_guide_scale, motion_frame, fps, overlap_window_length, seed_param, info, video_output, seed_output, video_path, extractor_button, info2, audio_output, audio_path3, separation_button, info3, audio_output3] + all_components = [device_info_display, GPU_memory_mode, teacache_threshold, num_skip_start_steps, clip_sample_n_frames, image_path, audio_path, prompt, negative_prompt, generate_button, width, height, exchange_button, adjust_button, guidance_scale, num_inference_steps, text_guide_scale, audio_guide_scale, motion_frame, fps, overlap_window_length, seed_param, info, video_output, seed_output, video_path, extractor_button, info2, audio_output, audio_path3, separation_button, info3, audio_output3] + # Use the full update_language function to translate everything language_radio.change( fn=update_language, inputs=[language_radio], @@ -493,4 +501,4 @@ def update_language(language): share=args.share, mcp_server=args.mcp_server, inbrowser=True, - ) + ) \ No newline at end of file diff --git a/audio_extractor.py b/audio_extractor.py old mode 100644 new mode 100755 index 1ba2061..7fd0305 --- a/audio_extractor.py +++ b/audio_extractor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os from moviepy.editor import VideoFileClip import argparse diff --git a/inference.py b/inference.py old mode 100644 new mode 100755 index 64cb461..4bfd7d3 --- a/inference.py +++ b/inference.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/inference.sh b/inference.sh old mode 100644 new mode 100755 index 55ddae1..8c8e43c --- a/inference.sh +++ b/inference.sh @@ -1,14 +1,18 @@ +#!/bin/bash +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + export TOKENIZERS_PARALLELISM=false -export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" +export MODEL_NAME="$SCRIPT_DIR/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" CUDA_VISIBLE_DEVICES=0 python inference.py \ --config_path="deepspeed_config/wan2.1/wan_civitai.yaml" \ --pretrained_model_name_or_path=$MODEL_NAME \ - --transformer_path="path/StableAvatar/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ - --pretrained_wav2vec_path="path/StableAvatar/checkpoints/wav2vec2-base-960h" \ - --validation_reference_path="path/StableAvatar/examples/case-1/reference.png" \ - --validation_driven_audio_path="path/StableAvatar/examples/case-1/audio.wav" \ - --output_dir="path/StableAvatar/output_infer" \ + --transformer_path="$SCRIPT_DIR/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ + --pretrained_wav2vec_path="$SCRIPT_DIR/checkpoints/wav2vec2-base-960h" \ + --validation_reference_path="$SCRIPT_DIR/examples/case-1/reference.png" \ + --validation_driven_audio_path="$SCRIPT_DIR/examples/case-1/audio.wav" \ + --output_dir="$SCRIPT_DIR/output_infer" \ 
--validation_prompts="A middle-aged woman with short light brown hair, wearing pearl earrings and a blue blazer, is speaking passionately in front of a blurred background resembling a government building. Her mouth is open mid-phrase, her expression is engaged and energetic, and the lighting is bright and even, suggesting a television interview or live broadcast. The scene gives the impression she is singing with conviction and purpose." \ --seed=42 \ --ulysses_degree=1 \ diff --git a/lip_mask_extractor.py b/lip_mask_extractor.py old mode 100644 new mode 100755 index a35306c..fac99c3 --- a/lip_mask_extractor.py +++ b/lip_mask_extractor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os diff --git a/multiple_gpu_inference.sh b/multiple_gpu_inference.sh old mode 100644 new mode 100755 index 8d12fdd..c3446df --- a/multiple_gpu_inference.sh +++ b/multiple_gpu_inference.sh @@ -1,5 +1,9 @@ +#!/bin/bash +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + export TOKENIZERS_PARALLELISM=false -export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" +export MODEL_NAME="$SCRIPT_DIR/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export WORLD_SIZE=4 export MASTER_ADDR="localhost" export MASTER_PORT=29500 @@ -7,11 +11,11 @@ export MASTER_PORT=29500 torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 --master_addr="localhost" --master_port=29500 inference.py \ --config_path="deepspeed_config/wan2.1/wan_civitai.yaml" \ --pretrained_model_name_or_path=$MODEL_NAME \ - --transformer_path="path/StableAvatar/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ - --pretrained_wav2vec_path="path/StableAvatar/checkpoints/wav2vec2-base-960h" \ - --validation_reference_path="path/StableAvatar/examples/case-1/reference.png" \ - --validation_driven_audio_path="path/StableAvatar/examples/case-1/audio.wav" \ - --output_dir="path/StableAvatar/output_infer" \ + --transformer_path="$SCRIPT_DIR/checkpoints/StableAvatar-1.3B/transformer3d-square.pt" \ + --pretrained_wav2vec_path="$SCRIPT_DIR/checkpoints/wav2vec2-base-960h" \ + --validation_reference_path="$SCRIPT_DIR/examples/case-1/reference.png" \ + --validation_driven_audio_path="$SCRIPT_DIR/examples/case-1/audio.wav" \ + --output_dir="$SCRIPT_DIR/output_infer" \ --validation_prompts="A middle-aged woman with short light brown hair, wearing pearl earrings and a blue blazer, is speaking passionately in front of a blurred background resembling a government building. Her mouth is open mid-phrase, her expression is engaged and energetic, and the lighting is bright and even, suggesting a television interview or live broadcast. The scene gives the impression she is singing with conviction and purpose." \ --seed=42 \ --ulysses_degree=2 \ diff --git a/requirements.txt b/requirements.txt index 2f69e17..4140b68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,70 @@ +# PyTorch installation notes: +# For CUDA 12.4: pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu124 +# For CUDA 12.8: pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128 +# The version ranges below support both configurations +# +# Installation order optimized for efficiency: +# 1. Build tools and core dependencies first +# 2. PyTorch ecosystem (large packages that others depend on) +# 3. Computer vision and image processing +# 4. ML/AI frameworks +# 5. Audio/video processing +# 6. 
Utilities and other packages +# +# Removed packages that are dependencies of others: +# - tokenizers (dependency of transformers) +# - imageio-ffmpeg (dependency of imageio[ffmpeg]) +# +# Optional packages (for enhanced performance/features): +# - flash-attn (for faster attention computation, install with: pip install flash-attn) +# - xformers (for memory efficient attention, install with: pip install xformers) +# - bitsandbytes (for 8-bit training, install with: pip install bitsandbytes) +# - audio-separator[gpu] (for vocal separation, install with: pip install audio-separator[gpu]) +# - decord (for faster video reading, install with: pip install decord) +# Note: decord is not available on macOS, falls back to torchvision automatically + +# Build tools and core dependencies ninja -Pillow -einops +numpy>=1.23.5,<2 +tqdm +easydict +omegaconf safetensors -timm -tomesd -torch==2.7.0 + +# PyTorch ecosystem (install first as other packages depend on these) +torch>=2.6.0,<=2.7.0 +torchvision>=0.21.0,<=0.22.0 +torchaudio>=2.1.1,<=2.7.0 torchdiffeq torchsde -decord -datasets -torchvision==0.22.0 + +# Computer vision and image processing +Pillow opencv-python>=4.9.0.80 -diffusers==0.30.1 -transformers==4.51.3 -tokenizers>=0.20.3 -accelerate>=1.1.1 -tqdm -easydict -ftfy -dashscope -imageio-ffmpeg -gradio>=5.0.0 -numpy>=1.23.5,<2 scikit-image -opencv-python -omegaconf -SentencePiece albumentations imageio[ffmpeg] imageio[pyav] -tensorboard -beautifulsoup4 + +# ML/AI frameworks and transformers +transformers==4.51.3 +diffusers==0.30.1 +accelerate>=1.1.1 +timm +tomesd +einops ftfy +SentencePiece +dashscope + +# Audio and video processing librosa -torchaudio==2.7.0 moviepy==1.0.3 + +# Data handling and utilities +datasets +tensorboard +beautifulsoup4 + +# Web interface +gradio>=5.0.0 diff --git a/train_14B.py b/train_14B.py old mode 100644 new mode 100755 index 4835a7b..fb18430 --- a/train_14B.py +++ b/train_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_14B.sh b/train_14B.sh old mode 100644 new mode 100755 index 77dcf60..c4fd6cd --- a/train_14B.sh +++ b/train_14B.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-I2V-14B-480P" diff --git a/train_14B_lora.py b/train_14B_lora.py old mode 100644 new mode 100755 index f6ef6bd..a7a60c8 --- a/train_14B_lora.py +++ b/train_14B_lora.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_14B_lora.sh b/train_14B_lora.sh old mode 100644 new mode 100755 index e3b2700..b10e7b8 --- a/train_14B_lora.sh +++ b/train_14B_lora.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-I2V-14B-480P" diff --git a/train_1B_rec_vec.py b/train_1B_rec_vec.py old mode 100644 new mode 100755 index b4ad332..5f7199a --- a/train_1B_rec_vec.py +++ b/train_1B_rec_vec.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_rec_vec.sh b/train_1B_rec_vec.sh old mode 100644 new mode 100755 index c0a916b..c970199 --- a/train_1B_rec_vec.sh +++ b/train_1B_rec_vec.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_rec_vec_64.sh b/train_1B_rec_vec_64.sh old mode 100644 new mode 100755 index 9aa709b..3d30898 --- a/train_1B_rec_vec_64.sh +++ b/train_1B_rec_vec_64.sh 
@@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/train_1B_rec_vec_lora.py b/train_1B_rec_vec_lora.py old mode 100644 new mode 100755 index 3dbb92d..b770d49 --- a/train_1B_rec_vec_lora.py +++ b/train_1B_rec_vec_lora.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_rec_vec_lora.sh b/train_1B_rec_vec_lora.sh old mode 100644 new mode 100755 index 7f368bd..daf1419 --- a/train_1B_rec_vec_lora.sh +++ b/train_1B_rec_vec_lora.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_rec_vec_lora_64.sh b/train_1B_rec_vec_lora_64.sh old mode 100644 new mode 100755 index 5a9f4d0..6919e49 --- a/train_1B_rec_vec_lora_64.sh +++ b/train_1B_rec_vec_lora_64.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/train_1B_square.py b/train_1B_square.py old mode 100644 new mode 100755 index f5aeb21..b0e5a90 --- a/train_1B_square.py +++ b/train_1B_square.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import gc import logging diff --git a/train_1B_square.sh b/train_1B_square.sh old mode 100644 new mode 100755 index eb7d5a3..ce4bb57 --- a/train_1B_square.sh +++ b/train_1B_square.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" export NCCL_IB_DISABLE=1 diff --git a/train_1B_square_64.sh b/train_1B_square_64.sh old mode 100644 new mode 100755 index 6280cf3..9cd99b4 --- a/train_1B_square_64.sh +++ b/train_1B_square_64.sh @@ -1,3 +1,4 @@ +#!/bin/bash export TOKENIZERS_PARALLELISM=false export MODEL_NAME="path/StableAvatar/checkpoints/Wan2.1-Fun-V1.1-1.3B-InP" diff --git a/vocal_seperator.py b/vocal_seperator.py old mode 100644 new mode 100755 index 3ecdb84..ed1be3d --- a/vocal_seperator.py +++ b/vocal_seperator.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import shutil diff --git a/wan/__init__.py b/wan/__init__.py old mode 100644 new mode 100755 index 62b57c6..5db04d7 --- a/wan/__init__.py +++ b/wan/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # from . import configs, distributed, modules # from .image2video import WanI2V # from .text2video import WanT2V diff --git a/wan/configs/__init__.py b/wan/configs/__init__.py old mode 100644 new mode 100755 index c72d2d0..5d23be8 --- a/wan/configs/__init__.py +++ b/wan/configs/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import copy import os diff --git a/wan/configs/shared_config.py b/wan/configs/shared_config.py old mode 100644 new mode 100755 index 04a9f45..0a8de04 --- a/wan/configs/shared_config.py +++ b/wan/configs/shared_config.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch from easydict import EasyDict diff --git a/wan/configs/wan_i2v_14B.py b/wan/configs/wan_i2v_14B.py old mode 100644 new mode 100755 index 12e8e20..3ff0c23 --- a/wan/configs/wan_i2v_14B.py +++ b/wan/configs/wan_i2v_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
import torch from easydict import EasyDict diff --git a/wan/configs/wan_t2v_14B.py b/wan/configs/wan_t2v_14B.py old mode 100644 new mode 100755 index 9d0ee69..25d4206 --- a/wan/configs/wan_t2v_14B.py +++ b/wan/configs/wan_t2v_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from easydict import EasyDict diff --git a/wan/configs/wan_t2v_1_3B.py b/wan/configs/wan_t2v_1_3B.py old mode 100644 new mode 100755 index ea9502b..68ad709 --- a/wan/configs/wan_t2v_1_3B.py +++ b/wan/configs/wan_t2v_1_3B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from easydict import EasyDict diff --git a/wan/dataset/talking_video_dataset_fantasy.py b/wan/dataset/talking_video_dataset_fantasy.py old mode 100644 new mode 100755 index b23b796..8569c9c --- a/wan/dataset/talking_video_dataset_fantasy.py +++ b/wan/dataset/talking_video_dataset_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import math import os import random diff --git a/wan/dist/__init__.py b/wan/dist/__init__.py old mode 100644 new mode 100755 index 8da6edf..825b5d4 --- a/wan/dist/__init__.py +++ b/wan/dist/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.distributed as dist diff --git a/wan/dist/wan_xfuser.py b/wan/dist/wan_xfuser.py old mode 100644 new mode 100755 index 9e02bf8..090f3c2 --- a/wan/dist/wan_xfuser.py +++ b/wan/dist/wan_xfuser.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.amp as amp diff --git a/wan/distributed/__init__.py b/wan/distributed/__init__.py old mode 100644 new mode 100755 diff --git a/wan/distributed/fsdp.py b/wan/distributed/fsdp.py old mode 100644 new mode 100755 index 18ba2f3..524f8d1 --- a/wan/distributed/fsdp.py +++ b/wan/distributed/fsdp.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc from functools import partial diff --git a/wan/distributed/xdit_context_parallel.py b/wan/distributed/xdit_context_parallel.py old mode 100644 new mode 100755 index 01936ce..2da7a5e --- a/wan/distributed/xdit_context_parallel.py +++ b/wan/distributed/xdit_context_parallel.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch import torch.cuda.amp as amp diff --git a/wan/image2video.py b/wan/image2video.py old mode 100644 new mode 100755 index b375fb9..2f771a6 --- a/wan/image2video.py +++ b/wan/image2video.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
import gc import logging diff --git a/wan/models/__init__.py b/wan/models/__init__.py old mode 100644 new mode 100755 diff --git a/wan/models/attention_processor.py b/wan/models/attention_processor.py old mode 100644 new mode 100755 index 23029b1..823c628 --- a/wan/models/attention_processor.py +++ b/wan/models/attention_processor.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from typing import Callable, List, Optional, Tuple, Union diff --git a/wan/models/cache_utils.py b/wan/models/cache_utils.py old mode 100644 new mode 100755 index d55d87f..ee520a1 --- a/wan/models/cache_utils.py +++ b/wan/models/cache_utils.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import numpy as np import torch diff --git a/wan/models/motion_controller.py b/wan/models/motion_controller.py old mode 100644 new mode 100755 index 9529bef..b430a51 --- a/wan/models/motion_controller.py +++ b/wan/models/motion_controller.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import torch.nn as nn diff --git a/wan/models/motion_to_bucket.py b/wan/models/motion_to_bucket.py old mode 100644 new mode 100755 index 8425b45..48e2b1b --- a/wan/models/motion_to_bucket.py +++ b/wan/models/motion_to_bucket.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch from diffusers import ModelMixin from einops import rearrange diff --git a/wan/models/vocal_projector_fantasy.py b/wan/models/vocal_projector_fantasy.py old mode 100644 new mode 100755 index 921a0f7..7670c1d --- a/wan/models/vocal_projector_fantasy.py +++ b/wan/models/vocal_projector_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import torch import torch.nn as nn diff --git a/wan/models/vocal_projector_fantasy_14B.py b/wan/models/vocal_projector_fantasy_14B.py old mode 100644 new mode 100755 index 52eebf4..e8f8c89 --- a/wan/models/vocal_projector_fantasy_14B.py +++ b/wan/models/vocal_projector_fantasy_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import warnings diff --git a/wan/models/vocal_projector_fantasy_1B.py b/wan/models/vocal_projector_fantasy_1B.py old mode 100644 new mode 100755 index 9f6eb31..bb0e802 --- a/wan/models/vocal_projector_fantasy_1B.py +++ b/wan/models/vocal_projector_fantasy_1B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import os import warnings diff --git a/wan/models/wan_fantasy_transformer3d_14B.py b/wan/models/wan_fantasy_transformer3d_14B.py old mode 100644 new mode 100755 index 0eb6511..1935c55 --- a/wan/models/wan_fantasy_transformer3d_14B.py +++ b/wan/models/wan_fantasy_transformer3d_14B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/models/wan_fantasy_transformer3d_1B.py b/wan/models/wan_fantasy_transformer3d_1B.py old mode 100644 new mode 100755 index 869e5f7..2eb8533 --- a/wan/models/wan_fantasy_transformer3d_1B.py +++ b/wan/models/wan_fantasy_transformer3d_1B.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
diff --git a/wan/models/wan_image_encoder.py b/wan/models/wan_image_encoder.py old mode 100644 new mode 100755 index 950b4cc..9ab8187 --- a/wan/models/wan_image_encoder.py +++ b/wan/models/wan_image_encoder.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip'' # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import math diff --git a/wan/models/wan_text_encoder.py b/wan/models/wan_text_encoder.py old mode 100644 new mode 100755 index 34a0323..f5b7f8f --- a/wan/models/wan_text_encoder.py +++ b/wan/models/wan_text_encoder.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/t5.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import math diff --git a/wan/models/wan_transformer3d.py b/wan/models/wan_transformer3d.py old mode 100644 new mode 100755 index 3507488..31ceb12 --- a/wan/models/wan_transformer3d.py +++ b/wan/models/wan_transformer3d.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/models/wan_vae.py b/wan/models/wan_vae.py old mode 100644 new mode 100755 index 4afb122..5cf8397 --- a/wan/models/wan_vae.py +++ b/wan/models/wan_vae.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/vae.py # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. from typing import Tuple, Union diff --git a/wan/models/wan_xlm_roberta.py b/wan/models/wan_xlm_roberta.py old mode 100644 new mode 100755 index 755baf3..edc7045 --- a/wan/models/wan_xlm_roberta.py +++ b/wan/models/wan_xlm_roberta.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Modified from transformers.models.xlm_roberta.modeling_xlm_roberta # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import torch diff --git a/wan/models/wav2vec.py b/wan/models/wav2vec.py old mode 100644 new mode 100755 index 5c2fad8..b328ad3 --- a/wan/models/wav2vec.py +++ b/wan/models/wav2vec.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding. 
It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities diff --git a/wan/pipeline/__init__.py b/wan/pipeline/__init__.py old mode 100644 new mode 100755 diff --git a/wan/pipeline/pipeline_wan_fun_inpaint.py b/wan/pipeline/pipeline_wan_fun_inpaint.py old mode 100644 new mode 100755 index 70dd114..568254d --- a/wan/pipeline/pipeline_wan_fun_inpaint.py +++ b/wan/pipeline/pipeline_wan_fun_inpaint.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from dataclasses import dataclass diff --git a/wan/pipeline/wan_inference_long_pipeline.py b/wan/pipeline/wan_inference_long_pipeline.py old mode 100644 new mode 100755 index 54c456b..4f7d7b7 --- a/wan/pipeline/wan_inference_long_pipeline.py +++ b/wan/pipeline/wan_inference_long_pipeline.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math import random diff --git a/wan/pipeline/wan_inference_pipeline_fantasy.py b/wan/pipeline/wan_inference_pipeline_fantasy.py old mode 100644 new mode 100755 index b89d4c6..859c8b2 --- a/wan/pipeline/wan_inference_pipeline_fantasy.py +++ b/wan/pipeline/wan_inference_pipeline_fantasy.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import inspect import math from dataclasses import dataclass diff --git a/wan/text2video.py b/wan/text2video.py old mode 100644 new mode 100755 index 2400545..d66dbb2 --- a/wan/text2video.py +++ b/wan/text2video.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. import gc import logging diff --git a/wan/utils/__init__.py b/wan/utils/__init__.py old mode 100644 new mode 100755 index 6e9a339..820bf01 --- a/wan/utils/__init__.py +++ b/wan/utils/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from .fm_solvers import (FlowDPMSolverMultistepScheduler, get_sampling_sigmas, retrieve_timesteps) from .fm_solvers_unipc import FlowUniPCMultistepScheduler diff --git a/wan/utils/color_correction.py b/wan/utils/color_correction.py old mode 100644 new mode 100755 index 83bef44..ef44906 --- a/wan/utils/color_correction.py +++ b/wan/utils/color_correction.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import numpy as np from skimage import color diff --git a/wan/utils/discrete_sampler.py b/wan/utils/discrete_sampler.py old mode 100644 new mode 100755 index 47f3557..a281fb1 --- a/wan/utils/discrete_sampler.py +++ b/wan/utils/discrete_sampler.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Modified from https://github.com/THUDM/CogVideo/blob/3710a612d8760f5cdb1741befeebb65b9e0f2fe0/sat/sgm/modules/diffusionmodules/sigma_sampling.py """ import torch diff --git a/wan/utils/fm_solvers.py b/wan/utils/fm_solvers.py old mode 100644 new mode 100755 index c908969..7a2e07f --- a/wan/utils/fm_solvers.py +++ b/wan/utils/fm_solvers.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py # Convert dpm solver for flow matching # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. diff --git a/wan/utils/fm_solvers_unipc.py b/wan/utils/fm_solvers_unipc.py old mode 100644 new mode 100755 index 57321ba..ea810ba --- a/wan/utils/fm_solvers_unipc.py +++ b/wan/utils/fm_solvers_unipc.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py # Convert unipc for flow matching # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. 
diff --git a/wan/utils/fp8_optimization.py b/wan/utils/fp8_optimization.py old mode 100644 new mode 100755 index cf77946..99dc72a --- a/wan/utils/fp8_optimization.py +++ b/wan/utils/fp8_optimization.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Modified from https://github.com/kijai/ComfyUI-MochiWrapper """ import torch diff --git a/wan/utils/language_utils.py b/wan/utils/language_utils.py new file mode 100755 index 0000000..335f4a0 --- /dev/null +++ b/wan/utils/language_utils.py @@ -0,0 +1,724 @@ +#!/usr/bin/env python3 +""" +Language detection and localization utilities for StableAvatar. +Provides browser language detection and interface localization. +""" + +import re +from typing import Dict, Any, Optional, List + + +def detect_browser_language(accept_language_header: str) -> str: + """ + Detect the preferred language from a browser Accept-Language header. + + Args: + accept_language_header: The Accept-Language header from the browser + + Returns: + str: Detected language code ('zh' for Chinese, 'en' for English, 'es' for Spanish, 'de' for German, 'ja' for Japanese, 'fr' for French, 'pt' for Portuguese, 'ru' for Russian; defaults to 'zh') + """ + if not accept_language_header: + return "zh" # Default to Chinese + + # Parse Accept-Language header (e.g., "zh-CN,zh;q=0.9,en;q=0.8") + languages = [] + for lang_part in accept_language_header.split(','): + lang_part = lang_part.strip() + if ';' in lang_part: + lang, quality = lang_part.split(';', 1) + quality = float(quality.split('=')[1]) if 'q=' in quality else 1.0 + else: + lang = lang_part + quality = 1.0 + + # Extract the base language code (e.g., "zh-CN" -> "zh", "en-US" -> "en") + lang_code = lang.split('-')[0].lower() + languages.append((lang_code, quality)) + + # Sort by quality (higher first) + languages.sort(key=lambda x: x[1], reverse=True) + + # Return the first supported language; regional variants were already + # collapsed to their base code above + supported_languages = {'zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', 'ru'} + for lang_code, _ in languages: + if lang_code in supported_languages: + return lang_code + + # Default to Chinese if no match + return "zh" + + +def get_language_from_request(request) -> str: + """ + Extract language preference from a Gradio request object. 
+ + Args: + request: Gradio request object + + Returns: + str: Detected language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru') + """ + try: + # Try to get Accept-Language header + accept_language = request.headers.get('Accept-Language', '') + return detect_browser_language(accept_language) + except Exception: + return "zh" # Default fallback + + +def get_interface_texts(language: str) -> Dict[str, Dict[str, str]]: + """ + Get interface texts for the specified language. + + Args: + language: Language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru') + + Returns: + Dict containing all interface texts + """ + if language == "en": + return { + "main": { + "title": "StableAvatar", + "device_info": "Running on: {device_summary} | Device: {device} | Data Type: {dtype}", + "language_label": "Language", + "model_settings": "Model Settings", + "video_generation": "Video Generation", + "audio_extraction": "Audio Extraction", + "vocal_separation": "Vocal Separation" + }, + "model_settings": { + "gpu_memory_mode": "GPU Memory Mode", + "gpu_memory_info": "Normal uses 25G VRAM, model_cpu_offload uses 13G VRAM", + "teacache_threshold": "TeaCache Threshold", + "teacache_info": "Recommended 0.1, 0 disables TeaCache acceleration", + "num_skip_start_steps": "Skip Start Steps", + "skip_steps_info": "Recommended 5", + "clip_sample_n_frames": "Clip Sample Frames", + "clip_frames_info": "Video frames, 81=2s@25fps, 161=4s@25fps, must be 4n+1", + "model_selection": "Transformer Model", + "model_selection_info": "Choose the transformer model type: Square (standard) or Rec-Vec (recommended)" + }, + "video_generation": { + "upload_image": "Upload Image", + "upload_audio": "Upload Audio", + "prompt": "Prompt", + "negative_prompt": "Negative Prompt", + "negative_prompt_default": "vivid colors, overexposed, static, blurry details, subtitles, style, artwork, painting, still image, overall gray, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, static image, cluttered background, three legs, many people in background, walking backwards", + "start_generation": "🎬 Start Generation", + "width": "Width", + "height": "Height", + "swap_dimensions": "🔄 Swap Width/Height", + "adjust_size": "Adjust Size Based on Image", + "guidance_scale": "Guidance Scale", + "sampling_steps": "Sampling Steps (Recommended 50)", + "text_guide_scale": "Text Guidance Scale", + "audio_guide_scale": "Audio Guidance Scale", + "motion_frame": "Motion Frame", + "fps": "FPS", + "overlap_window_length": "Overlap Window Length", + "seed": "Seed (positive integer, -1 for random)", + 
"status": "Status", + "generated_result": "Generated Result", + "seed_output": "Seed" + }, + "audio_extraction": { + "upload_video": "Upload Video", + "start_extraction": "🎬 Start Extraction", + "status": "Status", + "generated_result": "Generated Result" + }, + "vocal_separation": { + "upload_audio": "Upload Audio", + "start_separation": "🎬 Start Separation", + "status": "Status", + "generated_result": "Generated Result" + } + } + elif language == "es": # Spanish + return { + "main": { + "title": "StableAvatar", + "device_info": "Ejecutando en: {device_summary} | Dispositivo: {device} | Tipo de datos: {dtype}", + "language_label": "Idioma", + "model_settings": "Configuración del Modelo", + "video_generation": "Generación de Video", + "audio_extraction": "Extracción de Audio", + "vocal_separation": "Separación Vocal" + }, + "model_settings": { + "gpu_memory_mode": "Modo de Memoria GPU", + "gpu_memory_info": "Normal usa 25G VRAM, model_cpu_offload usa 13G VRAM", + "teacache_threshold": "Umbral TeaCache", + "teacache_info": "Recomendado 0.1, 0 desactiva la aceleración TeaCache", + "num_skip_start_steps": "Omitir Pasos Iniciales", + "skip_steps_info": "Recomendado 5", + "clip_sample_n_frames": "Frames de Muestra Clip", + "clip_frames_info": "Frames de video, 81=2s@25fps, 161=4s@25fps, debe ser 4n+1", + "model_selection": "Modelo Transformer", + "model_selection_info": "Elige el tipo de modelo transformer: Square (estándar) o Rec-Vec (recomendado)" + }, + "video_generation": { + "upload_image": "Subir Imagen", + "upload_audio": "Subir Audio", + "prompt": "Prompt", + "negative_prompt": "Prompt Negativo", + "negative_prompt_default": "colores vivos, sobreexpuesto, estático, detalles borrosos, subtítulos, estilo, obra de arte, pintura, imagen fija, gris general, peor calidad, baja calidad, artefactos de compresión JPEG, feo, incompleto, dedos extra, manos mal dibujadas, cara mal dibujada, deforme, desfigurado, extremidades malformadas, dedos fusionados, imagen estática, fondo desordenado, tres piernas, mucha gente en el fondo, caminando hacia atrás", + "start_generation": "🎬 Iniciar Generación", + "width": "Ancho", + "height": "Alto", + "swap_dimensions": "🔄 Intercambiar Ancho/Alto", + "adjust_size": "Ajustar Tamaño Basado en Imagen", + "guidance_scale": "Escala de Guía", + "sampling_steps": "Pasos de Muestreo (Recomendado 50)", + "text_guide_scale": "Escala de Guía de Texto", + "audio_guide_scale": "Escala de Guía de Audio", + "motion_frame": "Frame de Movimiento", + "fps": "FPS", + "overlap_window_length": "Longitud de Ventana de Solapamiento", + "seed": "Semilla (entero positivo, -1 para aleatorio)", + "status": "Estado", + "generated_result": "Resultado Generado", + "seed_output": "Semilla" + }, + "audio_extraction": { + "upload_video": "Subir Video", + "start_extraction": "🎬 Iniciar Extracción", + "status": "Estado", + "generated_result": "Resultado Generado" + }, + "vocal_separation": { + "upload_audio": "Subir Audio", + "start_separation": "🎬 Iniciar Separación", + "status": "Estado", + "generated_result": "Resultado Generado" + } + } + elif language == "de": # German + return { + "main": { + "title": "StableAvatar", + "device_info": "Läuft auf: {device_summary} | Gerät: {device} | Datentyp: {dtype}", + "language_label": "Sprache", + "model_settings": "Modelleinstellungen", + "video_generation": "Video-Generierung", + "audio_extraction": "Audio-Extraktion", + "vocal_separation": "Gesangstrennung" + }, + "model_settings": { + "gpu_memory_mode": "GPU-Speichermodus", + "gpu_memory_info": 
"Normal verwendet 25G VRAM, model_cpu_offload verwendet 13G VRAM", + "teacache_threshold": "TeaCache-Schwellenwert", + "teacache_info": "Empfohlen 0.1, 0 deaktiviert TeaCache-Beschleunigung", + "num_skip_start_steps": "Startschritte Überspringen", + "skip_steps_info": "Empfohlen 5", + "clip_sample_n_frames": "Clip-Sample-Frames", + "clip_frames_info": "Video-Frames, 81=2s@25fps, 161=4s@25fps, muss 4n+1 sein", + "model_selection": "Transformer-Modell", + "model_selection_info": "Wählen Sie den Transformer-Modelltyp: Square (Standard) oder Rec-Vec (empfohlen)" + }, + "video_generation": { + "upload_image": "Bild Hochladen", + "upload_audio": "Audio Hochladen", + "prompt": "Prompt", + "negative_prompt": "Negativer Prompt", + "negative_prompt_default": "lebendige Farben, überbelichtet, statisch, unscharfe Details, Untertitel, Stil, Kunstwerk, Gemälde, Standbild, insgesamt grau, schlechteste Qualität, niedrige Qualität, JPEG-Komprimierungsartefakte, hässlich, unvollständig, zusätzliche Finger, schlecht gezeichnete Hände, schlecht gezeichnetes Gesicht, deformiert, entstellt, missgebildete Gliedmaßen, verschmolzene Finger, statisches Bild, unordentlicher Hintergrund, drei Beine, viele Menschen im Hintergrund, rückwärts gehend", + "start_generation": "🎬 Generierung Starten", + "width": "Breite", + "height": "Höhe", + "swap_dimensions": "🔄 Breite/Höhe Tauschen", + "adjust_size": "Größe Basierend auf Bild Anpassen", + "guidance_scale": "Führungsskala", + "sampling_steps": "Sampling-Schritte (Empfohlen 50)", + "text_guide_scale": "Text-Führungsskala", + "audio_guide_scale": "Audio-Führungsskala", + "motion_frame": "Bewegungsframe", + "fps": "FPS", + "overlap_window_length": "Überlappungsfenster-Länge", + "seed": "Seed (positive Ganzzahl, -1 für zufällig)", + "status": "Status", + "generated_result": "Generiertes Ergebnis", + "seed_output": "Seed" + }, + "audio_extraction": { + "upload_video": "Video Hochladen", + "start_extraction": "🎬 Extraktion Starten", + "status": "Status", + "generated_result": "Generiertes Ergebnis" + }, + "vocal_separation": { + "upload_audio": "Audio Hochladen", + "start_separation": "🎬 Trennung Starten", + "status": "Status", + "generated_result": "Generiertes Ergebnis" + } + } + elif language == "ja": # Japanese + return { + "main": { + "title": "StableAvatar", + "device_info": "実行環境: {device_summary} | デバイス: {device} | データ型: {dtype}", + "language_label": "言語", + "model_settings": "モデル設定", + "video_generation": "動画生成", + "audio_extraction": "音声抽出", + "vocal_separation": "ボーカル分離" + }, + "model_settings": { + "gpu_memory_mode": "GPUメモリモード", + "gpu_memory_info": "Normalは25G VRAM、model_cpu_offloadは13G VRAMを使用", + "teacache_threshold": "TeaCache閾値", + "teacache_info": "推奨値0.1、0でTeaCache加速を無効化", + "num_skip_start_steps": "開始ステップをスキップ", + "skip_steps_info": "推奨値5", + "clip_sample_n_frames": "Clipサンプルフレーム", + "clip_frames_info": "動画フレーム、81=2秒@25fps、161=4秒@25fps、4n+1である必要があります", + "model_selection": "Transformerモデル", + "model_selection_info": "Transformerモデルタイプを選択: Square(標準)またはRec-Vec(推奨)" + }, + "video_generation": { + "upload_image": "画像をアップロード", + "upload_audio": "音声をアップロード", + "prompt": "プロンプト", + "negative_prompt": "ネガティブプロンプト", + "negative_prompt_default": "鮮やかな色、露出オーバー、静止、ぼやけた詳細、字幕、スタイル、アートワーク、絵画、静止画像、全体的にグレー、最悪の品質、低品質、JPEG圧縮アーティファクト、醜い、不完全、余分な指、不適切に描かれた手、不適切に描かれた顔、変形、破損、奇形の手足、融合した指、静止画像、乱雑な背景、3本足、背景に多くの人、後ろ向きに歩く", + "start_generation": "🎬 生成開始", + "width": "幅", + "height": "高さ", + "swap_dimensions": "🔄 幅/高さを交換", + "adjust_size": "画像に基づいてサイズを調整", + "guidance_scale": 
"ガイダンススケール", + "sampling_steps": "サンプリングステップ(推奨50)", + "text_guide_scale": "テキストガイダンススケール", + "audio_guide_scale": "音声ガイダンススケール", + "motion_frame": "モーションフレーム", + "fps": "FPS", + "overlap_window_length": "オーバーラップウィンドウ長", + "seed": "シード(正の整数、-1でランダム)", + "status": "ステータス", + "generated_result": "生成結果", + "seed_output": "シード" + }, + "audio_extraction": { + "upload_video": "動画をアップロード", + "start_extraction": "🎬 抽出開始", + "status": "ステータス", + "generated_result": "生成結果" + }, + "vocal_separation": { + "upload_audio": "音声をアップロード", + "start_separation": "🎬 分離開始", + "status": "ステータス", + "generated_result": "生成結果" + } + } + elif language == "fr": # French + return { + "main": { + "title": "StableAvatar", + "device_info": "Exécution sur: {device_summary} | Appareil: {device} | Type de données: {dtype}", + "language_label": "Langue", + "model_settings": "Paramètres du Modèle", + "video_generation": "Génération Vidéo", + "audio_extraction": "Extraction Audio", + "vocal_separation": "Séparation Vocale" + }, + "model_settings": { + "gpu_memory_mode": "Mode Mémoire GPU", + "gpu_memory_info": "Normal utilise 25G VRAM, model_cpu_offload utilise 13G VRAM", + "teacache_threshold": "Seuil TeaCache", + "teacache_info": "Recommandé 0.1, 0 désactive l'accélération TeaCache", + "num_skip_start_steps": "Ignorer les Étapes Initiales", + "skip_steps_info": "Recommandé 5", + "clip_sample_n_frames": "Images d'Échantillon Clip", + "clip_frames_info": "Images vidéo, 81=2s@25fps, 161=4s@25fps, doit être 4n+1", + "model_selection": "Modèle Transformer", + "model_selection_info": "Choisissez le type de modèle transformer: Square (standard) ou Rec-Vec (recommandé)" + }, + "video_generation": { + "upload_image": "Télécharger Image", + "upload_audio": "Télécharger Audio", + "prompt": "Prompt", + "negative_prompt": "Prompt Négatif", + "negative_prompt_default": "couleurs vives, surexposé, statique, détails flous, sous-titres, style, œuvre d'art, peinture, image fixe, gris général, pire qualité, basse qualité, artefacts de compression JPEG, laid, incomplet, doigts supplémentaires, mains mal dessinées, visage mal dessiné, déformé, défiguré, membres malformés, doigts fusionnés, image statique, arrière-plan encombré, trois jambes, beaucoup de gens en arrière-plan, marchant à reculons", + "start_generation": "🎬 Démarrer Génération", + "width": "Largeur", + "height": "Hauteur", + "swap_dimensions": "🔄 Échanger Largeur/Hauteur", + "adjust_size": "Ajuster Taille Basée sur Image", + "guidance_scale": "Échelle de Guidage", + "sampling_steps": "Étapes d'Échantillonnage (Recommandé 50)", + "text_guide_scale": "Échelle de Guidage Texte", + "audio_guide_scale": "Échelle de Guidage Audio", + "motion_frame": "Image de Mouvement", + "fps": "FPS", + "overlap_window_length": "Longueur Fenêtre de Chevauchement", + "seed": "Graine (entier positif, -1 pour aléatoire)", + "status": "Statut", + "generated_result": "Résultat Généré", + "seed_output": "Graine" + }, + "audio_extraction": { + "upload_video": "Télécharger Vidéo", + "start_extraction": "🎬 Démarrer Extraction", + "status": "Statut", + "generated_result": "Résultat Généré" + }, + "vocal_separation": { + "upload_audio": "Télécharger Audio", + "start_separation": "🎬 Démarrer Séparation", + "status": "Statut", + "generated_result": "Résultat Généré" + } + } + elif language == "pt": # Portuguese + return { + "main": { + "title": "StableAvatar", + "device_info": "Executando em: {device_summary} | Dispositivo: {device} | Tipo de dados: {dtype}", + "language_label": "Idioma", + "model_settings": 
"Configurações do Modelo", + "video_generation": "Geração de Vídeo", + "audio_extraction": "Extração de Áudio", + "vocal_separation": "Separação Vocal" + }, + "model_settings": { + "gpu_memory_mode": "Modo de Memória GPU", + "gpu_memory_info": "Normal usa 25G VRAM, model_cpu_offload usa 13G VRAM", + "teacache_threshold": "Limite TeaCache", + "teacache_info": "Recomendado 0.1, 0 desativa aceleração TeaCache", + "num_skip_start_steps": "Pular Passos Iniciais", + "skip_steps_info": "Recomendado 5", + "clip_sample_n_frames": "Quadros de Amostra Clip", + "clip_frames_info": "Quadros de vídeo, 81=2s@25fps, 161=4s@25fps, deve ser 4n+1", + "model_selection": "Modelo Transformer", + "model_selection_info": "Escolha o tipo de modelo transformer: Square (padrão) ou Rec-Vec (recomendado)" + }, + "video_generation": { + "upload_image": "Carregar Imagem", + "upload_audio": "Carregar Áudio", + "prompt": "Prompt", + "negative_prompt": "Prompt Negativo", + "negative_prompt_default": "cores vivas, superexposto, estático, detalhes borrados, legendas, estilo, obra de arte, pintura, imagem fixa, cinza geral, pior qualidade, baixa qualidade, artefatos de compressão JPEG, feio, incompleto, dedos extras, mãos mal desenhadas, rosto mal desenhado, deformado, desfigurado, membros malformados, dedos fundidos, imagem estática, fundo desordenado, três pernas, muitas pessoas no fundo, andando para trás", + "start_generation": "🎬 Iniciar Geração", + "width": "Largura", + "height": "Altura", + "swap_dimensions": "🔄 Trocar Largura/Altura", + "adjust_size": "Ajustar Tamanho Baseado na Imagem", + "guidance_scale": "Escala de Orientação", + "sampling_steps": "Passos de Amostragem (Recomendado 50)", + "text_guide_scale": "Escala de Orientação de Texto", + "audio_guide_scale": "Escala de Orientação de Áudio", + "motion_frame": "Quadro de Movimento", + "fps": "FPS", + "overlap_window_length": "Comprimento da Janela de Sobreposição", + "seed": "Semente (inteiro positivo, -1 para aleatório)", + "status": "Status", + "generated_result": "Resultado Gerado", + "seed_output": "Semente" + }, + "audio_extraction": { + "upload_video": "Carregar Vídeo", + "start_extraction": "🎬 Iniciar Extração", + "status": "Status", + "generated_result": "Resultado Gerado" + }, + "vocal_separation": { + "upload_audio": "Carregar Áudio", + "start_separation": "🎬 Iniciar Separação", + "status": "Status", + "generated_result": "Resultado Gerado" + } + } + elif language == "ru": # Russian + return { + "main": { + "title": "StableAvatar", + "device_info": "Запуск на: {device_summary} | Устройство: {device} | Тип данных: {dtype}", + "language_label": "Язык", + "model_settings": "Настройки Модели", + "video_generation": "Генерация Видео", + "audio_extraction": "Извлечение Аудио", + "vocal_separation": "Разделение Вокала" + }, + "model_settings": { + "gpu_memory_mode": "Режим Памяти GPU", + "gpu_memory_info": "Normal использует 25G VRAM, model_cpu_offload использует 13G VRAM", + "teacache_threshold": "Порог TeaCache", + "teacache_info": "Рекомендуется 0.1, 0 отключает ускорение TeaCache", + "num_skip_start_steps": "Пропустить Начальные Шаги", + "skip_steps_info": "Рекомендуется 5", + "clip_sample_n_frames": "Кадры Образца Clip", + "clip_frames_info": "Видеокадры, 81=2с@25fps, 161=4с@25fps, должно быть 4n+1", + "model_selection": "Модель Transformer", + "model_selection_info": "Выберите тип модели transformer: Square (стандартная) или Rec-Vec (рекомендуемая)" + }, + "video_generation": { + "upload_image": "Загрузить Изображение", + "upload_audio": "Загрузить 
Аудио", + "prompt": "Промпт", + "negative_prompt": "Негативный Промпт", + "negative_prompt_default": "яркие цвета, переэкспонированный, статичный, размытые детали, субтитры, стиль, произведение искусства, живопись, неподвижное изображение, общий серый, худшее качество, низкое качество, артефакты сжатия JPEG, уродливый, неполный, лишние пальцы, плохо нарисованные руки, плохо нарисованное лицо, деформированный, обезображенный, неправильно сформированные конечности, сросшиеся пальцы, статичное изображение, загроможденный фон, три ноги, много людей на фоне, идущий назад", + "start_generation": "🎬 Начать Генерацию", + "width": "Ширина", + "height": "Высота", + "swap_dimensions": "🔄 Поменять Ширину/Высоту", + "adjust_size": "Настроить Размер на Основе Изображения", + "guidance_scale": "Шкала Направления", + "sampling_steps": "Шаги Семплирования (Рекомендуется 50)", + "text_guide_scale": "Шкала Направления Текста", + "audio_guide_scale": "Шкала Направления Аудио", + "motion_frame": "Кадр Движения", + "fps": "FPS", + "overlap_window_length": "Длина Окна Перекрытия", + "seed": "Семя (положительное целое, -1 для случайного)", + "status": "Статус", + "generated_result": "Сгенерированный Результат", + "seed_output": "Семя" + }, + "audio_extraction": { + "upload_video": "Загрузить Видео", + "start_extraction": "🎬 Начать Извлечение", + "status": "Статус", + "generated_result": "Сгенерированный Результат" + }, + "vocal_separation": { + "upload_audio": "Загрузить Аудио", + "start_separation": "🎬 Начать Разделение", + "status": "Статус", + "generated_result": "Сгенерированный Результат" + } + } + else: # Chinese (zh) + return { + "main": { + "title": "StableAvatar", + "device_info": "运行环境: {device_summary} | 设备: {device} | 数据类型: {dtype}", + "language_label": "语言", + "model_settings": "模型设置", + "video_generation": "视频生成", + "audio_extraction": "音频提取", + "vocal_separation": "人声分离" + }, + "model_settings": { + "gpu_memory_mode": "显存模式", + "gpu_memory_info": "Normal占用25G显存,model_cpu_offload占用13G显存", + "teacache_threshold": "teacache threshold", + "teacache_info": "推荐参数0.1,0为禁用teacache加速", + "num_skip_start_steps": "跳过开始步数", + "skip_steps_info": "推荐参数5", + "clip_sample_n_frames": "Clip采样帧数", + "clip_frames_info": "视频帧数,81=2秒@25fps,161=4秒@25fps,必须为4n+1", + "model_selection": "Transformer模型", + "model_selection_info": "选择transformer模型类型:Square(标准)或Rec-Vec(推荐)" + }, + "video_generation": { + "upload_image": "上传图片", + "upload_audio": "上传音频", + "prompt": "提示词", + "negative_prompt": "负面提示词", + "negative_prompt_default": "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + "start_generation": "🎬 开始生成", + "width": "宽度", + "height": "高度", + "swap_dimensions": "🔄 交换宽高", + "adjust_size": "根据图片调整宽高", + "guidance_scale": "guidance scale", + "sampling_steps": "采样步数(推荐50步)", + "text_guide_scale": "text guidance scale", + "audio_guide_scale": "audio guidance scale", + "motion_frame": "motion frame", + "fps": "帧率", + "overlap_window_length": "overlap window length", + "seed": "种子,请输入正整数,-1为随机", + "status": "提示信息", + "generated_result": "生成结果", + "seed_output": "种子" + }, + "audio_extraction": { + "upload_video": "上传视频", + "start_extraction": "🎬 开始提取", + "status": "提示信息", + "generated_result": "生成结果" + }, + "vocal_separation": { + "upload_audio": "上传音频", + "start_separation": "🎬 开始分离", + "status": "提示信息", + "generated_result": "生成结果" + } + } + + +def get_display_language(language_code: str) -> str: + """ + Convert language code to display 
+
+
+def get_display_language(language_code: str) -> str:
+    """
+    Convert a language code to its display name.
+
+    Args:
+        language_code: Language code ('zh', 'en', 'es', 'de', 'ja', 'fr', 'pt', or 'ru')
+
+    Returns:
+        str: Display name ('中文', 'English', 'Español', 'Deutsch', '日本語', 'Français', 'Português', or 'Русский')
+    """
+    language_map = {
+        "zh": "中文",
+        "en": "English",
+        "es": "Español",
+        "de": "Deutsch",
+        "ja": "日本語",
+        "fr": "Français",
+        "pt": "Português",
+        "ru": "Русский"
+    }
+    return language_map.get(language_code, "中文")
+
+
+def get_language_choices() -> List[Tuple[str, str]]:
+    """
+    Get language choices for the Gradio Radio component.
+
+    Returns:
+        List of (display_name, language_code) tuples
+    """
+    return [
+        ("中文", "zh"),
+        ("English", "en"),
+        ("Español", "es"),
+        ("Deutsch", "de"),
+        ("日本語", "ja"),
+        ("Français", "fr"),
+        ("Português", "pt"),
+        ("Русский", "ru")
+    ]
+
+
+def create_language_detection_js() -> str:
+    """
+    Create simple JavaScript code for client-side language detection.
+    This version only sets the radio button without triggering events to avoid conflicts.
+
+    Returns:
+        str: JavaScript code for language detection
+    """
+    # Minimal sketch: reduce navigator.language to a supported code and check
+    # the matching radio option once the page has loaded. The selector below
+    # is an assumption about how Gradio renders the Radio component; adjust
+    # it if the markup differs. No change event is dispatched, so no Gradio
+    # callbacks fire.
+    return """
+    <script>
+    (function () {
+        const supported = ["zh", "en", "es", "de", "ja", "fr", "pt", "ru"];
+        const lang = (navigator.language || "zh").split("-")[0].toLowerCase();
+        const code = supported.includes(lang) ? lang : "zh";
+        window.addEventListener("load", function () {
+            document.querySelectorAll('input[type="radio"]').forEach(function (radio) {
+                if (radio.value === code) {
+                    radio.checked = true;
+                }
+            });
+        });
+    })();
+    </script>
+    """
+
+
+if __name__ == "__main__":
+    # Test the language detection
+    test_headers = [
+        "zh-CN,zh;q=0.9,en;q=0.8",
+        "en-US,en;q=0.9",
+        "es-ES,es;q=0.9,en;q=0.8",
+        "de-DE,de;q=0.9,en;q=0.8",
+        "ja-JP,ja;q=0.9,en;q=0.8",
+        "fr-FR,fr;q=0.9,en;q=0.8",
+        "pt-BR,pt;q=0.9,en;q=0.8",
+        "ru-RU,ru;q=0.9,en;q=0.8",
+        "zh-TW,zh;q=0.9"
+    ]
+
+    print("Testing language detection:")
+    for header in test_headers:
+        detected = detect_browser_language(header)
+        display_name = get_display_language(detected)
+        print(f"Header: {header} -> Detected: {detected} ({display_name})")
+
+    print("\nTesting interface texts:")
+    for lang in ["zh", "en", "es", "de", "ja", "fr", "pt", "ru"]:
+        texts = get_interface_texts(lang)
+        display_name = get_display_language(lang)
+        print(f"\n{display_name} ({lang.upper()}):")
+        print(f"  Title: {texts['main']['title']}")
+        print(f"  Language Label: {texts['main']['language_label']}")
+        print(f"  Start Generation: {texts['video_generation']['start_generation']}")
+        print(f"  Upload Image: {texts['video_generation']['upload_image']}")
+
+    print("\nTesting language choices:")
+    choices = get_language_choices()
+    for display_name, lang_code in choices:
+        print(f"  {display_name} -> {lang_code}")
diff --git a/wan/utils/lora_utils.py b/wan/utils/lora_utils.py
old mode 100644
new mode 100755
index 9748364..7a2f698
--- a/wan/utils/lora_utils.py
+++ b/wan/utils/lora_utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import hashlib
 import math
 import os
diff --git a/wan/utils/prompt_extend.py b/wan/utils/prompt_extend.py
old mode 100644
new mode 100755
index f3981a8..cf81f75
--- a/wan/utils/prompt_extend.py
+++ b/wan/utils/prompt_extend.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import json
 import math
diff --git a/wan/utils/qwen_vl_utils.py b/wan/utils/qwen_vl_utils.py
old mode 100644
new mode 100755
index 3c682e6..fe6619e
--- a/wan/utils/qwen_vl_utils.py
+++ b/wan/utils/qwen_vl_utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copied from https://github.com/kq-chen/qwen-vl-utils
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 from __future__ import annotations
diff --git a/wan/utils/utils.py b/wan/utils/utils.py
old mode 100644
new mode 100755
index 4cb9052..97c68ac
--- a/wan/utils/utils.py
+++ b/wan/utils/utils.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 import argparse
 import binascii

From 7defa77396ed2a1cfaea3cbfdfe2c8d0e2ea6014 Mon Sep 17 00:00:00 2001
From: jaminmc <1310376+jaminmc@users.noreply.github.com>
Date: Sun, 14 Sep 2025 22:57:05 -0400
Subject: [PATCH 2/2] Update .gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index a7797d5..20e99aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@ checkpoints
 
 # Generated outputs
 outputs/
-requirements.txt
 
 # Virtual environments
 venv/