From 45ba769650b622a3f06ac242841e154d3730f8bb Mon Sep 17 00:00:00 2001 From: maitrisavaliya Date: Tue, 9 Dec 2025 09:57:22 +0530 Subject: [PATCH 1/9] Add troubleshooting guide for common installation and usage issues --- TROUBLESHOOTING.md | 1720 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1720 insertions(+) create mode 100644 TROUBLESHOOTING.md diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..d26484f --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,1720 @@ +\# VibeVoice Comprehensive Troubleshooting Guide + + + +A complete reference for resolving common issues when installing and running Microsoft's VibeVoice text-to-speech models. + + + +--- + + + +\## Table of Contents + +1\. \[Installation Issues](#installation-issues) + +2\. \[Hardware and Memory Problems](#hardware-and-memory-problems) + +3\. \[Model Loading Errors](#model-loading-errors) + +4\. \[Audio Quality and Generation Issues](#audio-quality-and-generation-issues) + +5\. \[Platform-Specific Issues](#platform-specific-issues) + +6\. \[Performance Optimization](#performance-optimization) + +7\. \[Known Limitations](#known-limitations) + + + +--- + + + +\## Installation Issues + + + +\### Issue 1: Flash Attention Installation Fails on Windows + + + +\*\*Problem\*\*: Building Flash Attention from source fails with compilation errors on Windows systems. + + + +\*\*Symptoms\*\*: + +\- ImportError: FlashAttention2 package not found + +\- Compilation errors during pip install + +\- Visual Studio build errors + + + +\*\*Solutions\*\*: + + + +\*\*Option A: Use Pre-built Wheels (Recommended for Windows)\*\* + +```bash + +\# Download pre-built Flash Attention wheels from: + +\# https://github.com/kingbri1/flash-attention/releases + + + +\# Install the wheel matching your configuration: + +\# - Python version (3.10, 3.11, etc.) + +\# - CUDA version (12.1, 12.4, etc.) + +\# - PyTorch version + + + +pip install flash\_attn-2.x.x-cpxxx-cpxxx-win\_amd64.whl + +``` + + + +\*\*Option B: Run Without Flash Attention\*\* + +```bash + +\# Modify the model loading to use eager attention instead + +\# Add this flag when running inference: + +--attn\_implementation eager + +``` + + + +\*\*Option C: Use Docker (Most Reliable)\*\* + +```bash + +\# Pull NVIDIA PyTorch container + +docker pull nvcr.io/nvidia/pytorch:24.08-py3 + + + +\# Run VibeVoice inside container + +docker run --gpus all -it nvcr.io/nvidia/pytorch:24.08-py3 + +``` + + + +\### Issue 2: Transformers Version Incompatibility + + + +\*\*Problem\*\*: Error message "Transformers does not recognize `vibevoice` architecture" + + + +\*\*Symptoms\*\*: + +``` + +ValueError: The checkpoint you are trying to load has model type `vibevoice` + +but Transformers does not recognize this architecture. 
+ +``` + + + +\*\*Solutions\*\*: + + + +\*\*Step 1: Update Transformers\*\* + +```bash + +\# Install the latest development version + +pip install git+https://github.com/huggingface/transformers.git + + + +\# Or install from specific PR if needed + +pip install git+https://github.com/huggingface/transformers.git@refs/pull/40546/head + +``` + + + +\*\*Step 2: Verify Installation\*\* + +```python + +import transformers + +print(transformers.\_\_version\_\_) # Should be 4.51.3 or higher + +``` + + + +\*\*Step 3: Clear Cache\*\* + +```bash + +\# Clear HuggingFace cache if issues persist + +rm -rf ~/.cache/huggingface/hub/models--microsoft--VibeVoice\* + +``` + + + +\### Issue 3: CUDA and PyTorch Version Mismatch + + + +\*\*Problem\*\*: CUDA errors or "Torch not compiled with CUDA enabled" + + + +\*\*Symptoms\*\*: + +\- AssertionError: Torch not compiled with CUDA enabled + +\- CUDA runtime errors + +\- GPU not detected + + + +\*\*Solution\*\*: + +```bash + +\# Check your CUDA version + +nvidia-smi + + + +\# Install matching PyTorch (example for CUDA 12.1) + +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + + + +\# Verify CUDA is available in PyTorch + +python -c "import torch; print(torch.cuda.is\_available())" + +``` + + + +\*\*Requirements\*\*: + +\- CUDA 12.x or higher + +\- cuBLAS and cuDNN 9.x + +\- Matching PyTorch version + + + +--- + + + +\## Hardware and Memory Problems + + + +\### Issue 4: CUDA Out of Memory (OOM) + + + +\*\*Problem\*\*: GPU runs out of VRAM during model loading or inference + + + +\*\*Symptoms\*\*: + +``` + +RuntimeError: CUDA out of memory. Tried to allocate X.XX GiB + +``` + + + +\*\*VRAM Requirements\*\*: + +\- \*\*VibeVoice-1.5B\*\*: ~7 GB VRAM (works on RTX 3060 8GB) + +\- \*\*VibeVoice-7B FP16\*\*: ~19-24 GB VRAM (RTX 3090/4090 or A5000+) + +\- \*\*VibeVoice-7B 8-bit\*\*: ~12 GB VRAM + +\- \*\*VibeVoice-7B 4-bit\*\*: ~6-8 GB VRAM + +\- \*\*VibeVoice-Realtime-0.5B\*\*: ~4 GB VRAM + + + +\*\*Solutions by GPU Size\*\*: + + + +\*\*For 8GB GPUs (RTX 3060, 3070, etc.)\*\*: + +```bash + +\# Option 1: Use 1.5B model + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-1.5B \\ + +  --txt\_path your\_text.txt \\ + +  --speaker\_names Alice Bob + + + +\# Option 2: Use 7B with 4-bit quantization + +python demo/inference\_from\_file.py \\ + +  --model\_path Dannidee/VibeVoice7b-low-vram \\ + +  --txt\_path your\_text.txt \\ + +  --speaker\_names Alice Bob + +``` + + + +\*\*For 12GB GPUs (RTX 3060 Ti, 4070, etc.)\*\*: + +```bash + +\# Use 8-bit quantization + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-7B \\ + +  --load\_in\_8bit \\ + +  --txt\_path your\_text.txt \\ + +  --speaker\_names Alice Bob + +``` + + + +\*\*For 24GB+ GPUs (RTX 3090, 4090, A5000+)\*\*: + +```bash + +\# Use full precision 7B model + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-7B \\ + +  --txt\_path your\_text.txt \\ + +  --speaker\_names Alice Bob + +``` + + + +\### Issue 5: Quantization Setup + + + +\*\*Problem\*\*: Need to reduce VRAM usage through quantization + + + +\*\*4-bit Quantization (Best for Low VRAM)\*\*: + +```bash + +\# Install bitsandbytes + +pip install bitsandbytes + + + +\# Option 1: Use pre-quantized model + +git clone https://huggingface.co/Dannidee/VibeVoice7b-low-vram + + + +\# Option 2: Quantize your own model + +python quantize\_and\_save\_vibevoice.py \\ + +  --model\_path /path/to/original/model \\ + +  --output\_dir 
/path/to/output/4bit \\ + +  --bits 4 \\ + +  --test + +``` + + + +\*\*8-bit Quantization (Better Quality)\*\*: + +```python + +from transformers import BitsAndBytesConfig + + + +quantization\_config = BitsAndBytesConfig( + +  load\_in\_8bit=True, + +  llm\_int8\_threshold=6.0, + +) + + + +\# Load model with config + +model = VibeVoiceForConditionalGeneration.from\_pretrained( + +  "vibevoice/VibeVoice-7B", + +  quantization\_config=quantization\_config, + +  device\_map="auto" + +) + +``` + + + +\*\*Important Notes\*\*: + +\- 4-bit quantization: ~6.6GB VRAM, minimal quality loss + +\- 8-bit quantization: ~12GB VRAM, better quality + +\- Requires CUDA GPU and bitsandbytes library + +\- Audio quality remains excellent with proper quantization + + + +\### Issue 6: CPU-Only Usage + + + +\*\*Problem\*\*: Trying to run VibeVoice without a GPU + + + +\*\*Reality Check\*\*: + +\- CPU inference is \*\*extremely slow\*\* and not practical + +\- Model requires CUDA-enabled GPU for reasonable performance + +\- Even with 32GB RAM, CPU mode is not recommended + + + +\*\*Workaround (Not Recommended)\*\*: + +```bash + +\# Force CPU mode (will be very slow) + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-1.5B \\ + +  --device cpu \\ + +  --attn\_implementation eager \\ + +  --txt\_path your\_text.txt \\ + +  --speaker\_names Alice + +``` + + + +\*\*Better Alternative\*\*: + +\- Use Google Colab with free GPU + +\- Use cloud GPU services (AWS, GCP, RunPod) + +\- Access community-hosted demos online + + + +--- + + + +\## Model Loading Errors + + + +\### Issue 7: "Failed to load VibeVoice processor" Error + + + +\*\*Problem\*\*: Model fails to load with NoneType or path errors + + + +\*\*Symptoms\*\*: + +``` + +Error: expected str, bytes or os.PathLike object, not NoneType + +Please ensure transformers>=4.51.3 is installed + +``` + + + +\*\*Solutions\*\*: + + + +\*\*Step 1: Verify Model Path\*\* + +```bash + +\# Check if model files exist + +ls -la ~/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/ + + + +\# Or for local models + +ls -la /path/to/your/model/ + +``` + + + +\*\*Step 2: Reinstall Transformers\*\* + +```bash + +pip uninstall transformers -y + +pip install transformers>=4.51.3 + +``` + + + +\*\*Step 3: Clear and Re-download\*\* + +```python + +from huggingface\_hub import snapshot\_download + + + +\# Download fresh copy + +snapshot\_download( + +  repo\_id="microsoft/VibeVoice-1.5B", + +  local\_dir="./models/VibeVoice-1.5B", + +  force\_download=True + +) + +``` + + + +\### Issue 8: Invalid Device String Error + + + +\*\*Problem\*\*: Error "Invalid device string: '0'" + + + +\*\*Symptoms\*\*: + +``` + +RuntimeError: Invalid device string: '0' + +Failed to load model even with eager attention + +``` + + + +\*\*Solution\*\*: + +```python + +\# Change device specification from '0' to 'cuda:0' or 'cuda' + + + +\# Wrong: + +device = '0' + + + +\# Correct: + +device = 'cuda:0' + +\# or + +device = 'cuda' + +\# or for CPU + +device = 'cpu' + +``` + + + +\*\*In command line\*\*: + +```bash + +\# Instead of --device 0 + +python inference.py --device cuda:0 + + + +\# Or let it auto-detect + +python inference.py --device cuda + +``` + + + +\### Issue 9: DynamicCache Memory Issues + + + +\*\*Problem\*\*: Long sequences cause cache-related crashes + + + +\*\*Symptoms\*\*: + +``` + +RuntimeError: CUDA out of memory during cache update + +DynamicCache overflow + +``` + + + +\*\*Solution - Modify Code\*\*: + + + +Edit `modeling\_vibevoice\_inference.py` around line 
518: + + + +```python + +\# Add try-except block to handle cache updates + +try: + +  for layer\_idx, (k\_cache, v\_cache) in enumerate( + +  zip( + +  negative\_model\_kwargs\['past\_key\_values'].key\_cache, + +  negative\_model\_kwargs\['past\_key\_values'].value\_cache + +  ) + +  ): + +  for sample\_idx in diffusion\_start\_indices.tolist(): + +  k\_cache\[sample\_idx, :, -1, :] = k\_cache\[sample\_idx, :, 0, :].clone() + +  v\_cache\[sample\_idx, :, -1, :] = v\_cache\[sample\_idx, :, 0, :].clone() + +except Exception as e: + +  print(f"Cache update failed: {e}") + +  negative\_model\_kwargs\['past\_key\_values'] = None + +``` + + + +\*\*Alternative\*\*: + +```python + +\# Disable caching for long sequences + +model.generation\_config.use\_cache = False + +``` + + + +--- + + + +\## Audio Quality and Generation Issues + + + +\### Issue 10: Poor Audio Quality or Robotic Voice + + + +\*\*Problem\*\*: Generated speech sounds distorted, robotic, or unnatural + + + +\*\*Causes and Solutions\*\*: + + + +\*\*1. Wrong Language Input\*\* + +```bash + +\# VibeVoice-1.5B/7B: English and Chinese only + +\# Other languages produce poor results + + + +\# Bad: + +text = "Bonjour, comment allez-vous?" # French not supported + + + +\# Good: + +text = "Hello, how are you?" # English + +text = "你好,你好吗?" # Chinese + +``` + + + +\*\*2. Improper Text Formatting\*\* + +```python + +\# Bad - No speaker labels + +text = "Hello. How are you? I'm fine." + + + +\# Good - Clear speaker labels + +text = """ + +Speaker 1: Hello, how are you? + +Speaker 2: I'm fine, thank you! How about you? + +Speaker 1: I'm doing great! + +""" + + + +\# Alternative format + +text = """ + +\[1] Hello, how are you? + +\[2] I'm fine, thank you! How about you? + +\[1] I'm doing great! + +""" + +``` + + + +\*\*3. Text Too Long (Acceleration Issues)\*\* + +```python + +\# Problem: Texts over 250 words cause audio to speed up + + + +\# Solution: Split into chunks + +def split\_text\_into\_chunks(text, max\_words=250): + +  words = text.split() + +  chunks = \[] + +  for i in range(0, len(words), max\_words): + +  chunk = ' '.join(words\[i:i+max\_words]) + +  chunks.append(chunk) + +  return chunks + + + +\# Process each chunk separately + +for chunk in chunks: + +  audio = generate\_speech(chunk) + +  concatenated\_audio.append(audio) + +``` + + + +\*\*4. 
Wrong Sample Rate\*\* + +```python + +\# VibeVoice expects 24kHz + +\# Ensure reference audio matches + +import librosa + + + +audio, sr = librosa.load('reference.wav', sr=24000) + +``` + + + +\### Issue 11: Background Music and Artifacts + + + +\*\*Problem\*\*: Unwanted background music, chimes, or sound effects appear + + + +\*\*Explanation\*\*: + +\- This is a known behavior, especially in 1.5B model + +\- Model sometimes adds background music or effects + +\- More common in longer generations (45-90 minutes) + + + +\*\*Mitigation\*\*: + +```bash + +\# Use specific speakers that don't include background music + +\# Avoid speakers with '\_bgm' suffix unless you want music + + + +\# Without background music: + +--speaker\_names Alice Frank Carter + + + +\# With background music (intentional): + +--speaker\_names Mary\_woman\_bgm Anchen\_man\_bgm + +``` + + + +\*\*Post-Processing\*\*: + +```python + +\# Use audio editing to remove unwanted sounds + +from pydub import AudioSegment + +from pydub.effects import high\_pass\_filter, low\_pass\_filter + + + +audio = AudioSegment.from\_wav("output.wav") + +\# Apply filters to reduce artifacts + +filtered = high\_pass\_filter(audio, cutoff=80) + +filtered = low\_pass\_filter(filtered, cutoff=8000) + +filtered.export("cleaned.wav", format="wav") + +``` + + + +\### Issue 12: Inconsistent Speaker Voices + + + +\*\*Problem\*\*: Speaker voice changes mid-generation or speakers sound too similar + + + +\*\*Causes\*\*: + +1\. Insufficient voice reference samples + +2\. Similar speaker names confusing the model + +3\. Very long sequences (45+ minutes) + + + +\*\*Solutions\*\*: + + + +\*\*1. Use Distinct Voice References\*\* + +```python + +\# Provide clear, different reference samples + +speaker\_voices = { + +  "Alice": "demo/voices/en-Alice\_woman.wav", + +  "Frank": "demo/voices/en-Frank\_man.wav", + +  "Carter": "demo/voices/en-Carter\_man.wav", + +} + +``` + + + +\*\*2. Use Clear, Distinct Speaker Names\*\* + +```bash + +\# Bad - Similar names + +--speaker\_names John Jon Jean + + + +\# Good - Distinct names + +--speaker\_names Alice Bob Carol Dave + +``` + + + +\*\*3. 
Keep Generation Length Reasonable\*\* + +\- 1.5B model: Best quality under 30 minutes + +\- 7B model: Better quality for 45-90 minute content + +\- Split very long content into segments + + + +\### Issue 13: No Overlapping Speech Support + + + +\*\*Problem\*\*: Cannot generate two speakers talking simultaneously + + + +\*\*Reality\*\*: + +\- This is a \*\*known limitation\*\* of current VibeVoice models + +\- Model generates turn-based dialogue only + +\- No interruptions or overlapping speech + + + +\*\*Workaround\*\*: + +\- Design dialogue with clear turn-taking + +\- Use post-processing to manually mix audio if needed + +\- Wait for future model updates + + + +--- + + + +\## Platform-Specific Issues + + + +\### Issue 14: Windows-Specific Problems + + + +\*\*Path Issues\*\*: + +```python + +\# Use raw strings or forward slashes + +\# Bad: + +path = "C:\\Users\\name\\VibeVoice\\model" + + + +\# Good: + +path = r"C:\\Users\\name\\VibeVoice\\model" + +\# or + +path = "C:/Users/name/VibeVoice/model" + +``` + + + +\*\*Visual Studio Build Tools\*\*: + +```bash + +\# Required for building certain packages + +\# Download from: https://visualstudio.microsoft.com/downloads/ + +\# Install "Desktop development with C++" + +``` + + + +\*\*Line Ending Issues\*\*: + +```bash + +\# Convert Linux line endings to Windows if needed + +git config --global core.autocrlf true + +``` + + + +\### Issue 15: macOS Apple Silicon (M1/M2/M3) + + + +\*\*Problem\*\*: Running on Mac with Apple Silicon GPUs + + + +\*\*Solution - Use MPS Backend\*\*: + +```python + +import torch + + + +\# Check MPS availability + +if torch.backends.mps.is\_available(): + +  device = "mps" + +else: + +  device = "cpu" + + + +\# Load model to MPS + +model = model.to(device) + +``` + + + +\*\*Performance Notes\*\*: + +\- MPS backend is significantly faster than CPU + +\- Still slower than NVIDIA GPUs + +\- Best for 1.5B model, 7B may be slow + + + +\*\*ComfyUI Integration\*\*: + +\- Recent ComfyUI-VibeVoice versions auto-detect MPS + +\- Provides good performance improvements over CPU + + + +\### Issue 16: Linux Permission Issues + + + +\*\*Problem\*\*: Permission denied when accessing model files + + + +\*\*Solution\*\*: + +```bash + +\# Fix permissions + +chmod -R 755 ~/.cache/huggingface/ + +chmod -R 755 ~/VibeVoice/ + + + +\# If using Docker, ensure proper volume mounting + +docker run --gpus all \\ + +  -v ~/.cache/huggingface:/root/.cache/huggingface \\ + +  -v $(pwd):/workspace \\ + +  -it pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime + +``` + + + +--- + + + +\## Performance Optimization + + + +\### Issue 17: Slow Generation Speed + + + +\*\*Problem\*\*: Inference takes too long + + + +\*\*Optimization Strategies\*\*: + + + +\*\*1. Enable Flash Attention\*\* + +```python + +\# Fastest attention mechanism + +model = AutoModelForConditionalGeneration.from\_pretrained( + +  "vibevoice/VibeVoice-1.5B", + +  attn\_implementation="flash\_attention\_2", + +  torch\_dtype=torch.bfloat16, + +) + +``` + + + +\*\*2. Use Optimal Diffusion Steps\*\* + +```bash + +\# Default is 20, can reduce for speed + +\# Quality vs Speed trade-off: + +--diffusion\_steps 10 # Fast, lower quality + +--diffusion\_steps 20 # Balanced (recommended) + +--diffusion\_steps 50 # Slow, highest quality + +``` + + + +\*\*3. Batch Processing\*\* + +```python + +\# Process multiple short texts efficiently + +\# Instead of one 10-minute generation + +\# Do five 2-minute generations in parallel + +``` + + + +\*\*4. 
Use Appropriate Model Size\*\* + +```bash + +\# For simple, short generations + +--model\_path microsoft/VibeVoice-Realtime-0.5B + + + +\# For quality multi-speaker + +--model\_path vibevoice/VibeVoice-1.5B + + + +\# For best quality long-form + +--model\_path vibevoice/VibeVoice-7B + +``` + + + +\### Issue 18: High VRAM Usage During Long Generations + + + +\*\*Problem\*\*: VRAM usage grows over time in long sequences + + + +\*\*Solutions\*\*: + + + +\*\*1. Clear Cache Periodically\*\* + +```python + +import torch + + + +\# Clear cache between generations + +torch.cuda.empty\_cache() + +``` + + + +\*\*2. Use Gradient Checkpointing\*\* + +```python + +model.gradient\_checkpointing\_enable() + +``` + + + +\*\*3. Enable Memory Cleanup\*\* + +```python + +\# In ComfyUI-VibeVoice + +free\_memory\_after\_generate = True + +``` + + + +--- + + + +\## Known Limitations + + + +\### Current Model Constraints + + + +\*\*Language Support\*\*: + +\- \*\*VibeVoice-1.5B/7B\*\*: English and Chinese only + +\- \*\*VibeVoice-Realtime-0.5B\*\*: English only + +\- Experimental speakers (DE, FR, IT, JP, KR, NL, PL, PT, ES) - quality varies + + + +\*\*Audio Limitations\*\*: + +\- No background noise generation + +\- No music generation (except BGM speakers) + +\- No sound effects + +\- No overlapping/simultaneous speech + + + +\*\*Technical Constraints\*\*: + +\- Maximum 4 speakers in multi-speaker mode + +\- Realtime model: Single speaker only + +\- 1.5B: Best under 30 minutes + +\- 7B: Best under 90 minutes + + + +\*\*Quality Issues\*\*: + +\- Voice consistency may degrade in very long sequences + +\- Random background music/effects may appear + +\- 1.5B has more artifacts than 7B + + + +\### Repository Status + + + +\*\*Important\*\*: As of September 2025, the official Microsoft repository was temporarily disabled due to misuse concerns. The code has since been restored but without executable code. Use community forks for active development: + + + +\- Community Fork: https://github.com/vibevoice-community/VibeVoice + +\- Model Hub: https://huggingface.co/collections/microsoft/vibevoice + + + +--- + + + +\## Getting Additional Help + + + +\### Community Resources + + + +\*\*Discord\*\*: Unofficial VibeVoice Discord server for real-time help + + + +\*\*GitHub Issues\*\*: + +\- Microsoft: https://github.com/microsoft/VibeVoice/issues + +\- Community: https://github.com/vibevoice-community/VibeVoice/issues + + + +\*\*HuggingFace Discussions\*\*: https://huggingface.co/microsoft/VibeVoice-1.5B/discussions + + + +\### Reporting Issues + + + +When reporting problems, include: + +1\. VibeVoice model version (1.5B, 7B, Realtime) + +2\. Operating system and version + +3\. GPU model and VRAM + +4\. Python version + +5\. Transformers version + +6\. Complete error traceback + +7\. Minimal reproduction code + + + +\*\*Example Issue Report\*\*: + +``` + +Environment: + +\- Model: VibeVoice-7B + +\- OS: Windows 11 + +\- GPU: RTX 3060 Ti (8GB) + +\- Python: 3.10.12 + +\- Transformers: 4.51.3 + +\- CUDA: 12.1 + + + +Problem: + +CUDA out of memory error during 20-minute generation + + + +Error: + +RuntimeError: CUDA out of memory. Tried to allocate 2.5 GiB + + + +Steps to reproduce: + +1\. Load model with default settings + +2\. Process 5000-word text file + +3\. 
Error occurs at ~15 minutes + + + +Expected behavior: + +Should complete 20-minute generation + +``` + + + +--- + + + +\## Quick Reference: Common Commands + + + +\### Basic Inference + +```bash + +\# Single speaker + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-1.5B \\ + +  --txt\_path input.txt \\ + +  --speaker\_names Alice + + + +\# Multi-speaker + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-7B \\ + +  --txt\_path dialogue.txt \\ + +  --speaker\_names Alice Bob Carol + +``` + + + +\### Memory-Optimized + +```bash + +\# 4-bit quantization + +python demo/inference\_from\_file.py \\ + +  --model\_path Dannidee/VibeVoice7b-low-vram \\ + +  --txt\_path input.txt \\ + +  --speaker\_names Alice Bob + + + +\# 8-bit with eager attention + +python demo/inference\_from\_file.py \\ + +  --model\_path vibevoice/VibeVoice-7B \\ + +  --load\_in\_8bit \\ + +  --attn\_implementation eager \\ + +  --txt\_path input.txt \\ + +  --speaker\_names Alice + +``` + + + +\### Gradio Demo + +```bash + +\# Launch web interface + +python demo/gradio\_demo.py \\ + +  --model\_path vibevoice/VibeVoice-1.5B \\ + +  --device cuda \\ + +  --share + +``` + + + +\### Realtime Streaming + +```bash + +\# Streaming TTS + +python demo/streaming\_inference\_from\_file.py \\ + +  --model\_path microsoft/VibeVoice-Realtime-0.5B \\ + +  --txt\_path input.txt \\ + +  --speaker\_name Emma + +``` + + + +--- + + + +\## Maintenance and Updates + + + +\### Keeping System Updated + + + +```bash + +\# Update core packages + +pip install --upgrade transformers accelerate torch + + + +\# Update VibeVoice code (community fork) + +cd VibeVoice + +git pull origin main + + + +\# Clear old cached models + +rm -rf ~/.cache/huggingface/hub/models--microsoft--VibeVoice\* + + + +\# Re-download fresh models + +python -c "from huggingface\_hub import snapshot\_download; snapshot\_download('microsoft/VibeVoice-1.5B')" + +``` + + + +\### Monitoring Performance + + + +```bash + +\# Watch GPU usage in real-time + +watch -n 0.5 nvidia-smi + + + +\# Profile memory usage + +python -m memory\_profiler your\_script.py + + + +\# Check CUDA memory in Python + +import torch + +print(f"Allocated: {torch.cuda.memory\_allocated() / 1e9:.2f} GB") + +print(f"Cached: {torch.cuda.memory\_reserved() / 1e9:.2f} GB") + +``` + + + +--- + + + +\## Final Notes + + + +This guide covers the most common issues encountered when using VibeVoice. The project is actively being developed, and solutions may change as new versions are released. Always check the official documentation and community forums for the latest information. + + + +\*\*Remember\*\*: + +\- Start with the 1.5B model before attempting 7B + +\- Use quantization if VRAM-constrained + +\- Keep text under 250 words per chunk + +\- Stick to English/Chinese for best results + +\- Test on short samples before long generations + + + +\*\*For commercial use\*\*: VibeVoice is research-only. Do not use in production without extensive testing and appropriate safeguards against misuse. + + + +--- + + + +\## Changelog + + + +\*\*Version 1.0\*\* (December 2025) + +\- Initial comprehensive troubleshooting guide + +\- Covers installation, hardware, loading, quality, and platform issues + +\- Includes quantization strategies and performance optimization + + + +--- + + + +\*Document compiled from community experience, GitHub issues, and official documentation. 
Contributions welcome via pull request.\*
+
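The chunking snippet in Issue 10 of the guide above is a fragment (`chunks` and `concatenated_audio` are never defined before use). A self-contained sketch of the same idea follows; here `generate_speech` stands in for whatever inference call wraps VibeVoice and is not part of its API:

```python
import numpy as np

def split_text_into_chunks(text, max_words=250):
    """Split text on word boundaries into chunks of at most max_words words."""
    words = text.split()
    return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

def synthesize_long_text(text, generate_speech):
    """Generate each chunk separately, then concatenate the resulting audio arrays."""
    pieces = [generate_speech(chunk) for chunk in split_text_into_chunks(text)]
    return np.concatenate(pieces) if pieces else np.zeros(0, dtype=np.float32)
```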
From 573d852d739110fc4611e98a52eff253f0079b0b Mon Sep 17 00:00:00 2001
From: maitrisavaliya
Date: Wed, 10 Dec 2025 14:14:08 +0530
Subject: [PATCH 2/9] Add quantization support to reduce VRAM requirements

---
 demo/realtime_model_inference_from_file.py |  50 +++++++--
 utils/__init__.py                          |   0
 utils/quantization,py                      | 113 +++++++++++++++++++++
 utils/vram_utils.py                        |  87 ++++++++++++++++
 4 files changed, 244 insertions(+), 6 deletions(-)
 create mode 100644 utils/__init__.py
 create mode 100644 utils/quantization,py
 create mode 100644 utils/vram_utils.py

diff --git a/demo/realtime_model_inference_from_file.py b/demo/realtime_model_inference_from_file.py
index ddc5d00..5afc098 100644
--- a/demo/realtime_model_inference_from_file.py
+++ b/demo/realtime_model_inference_from_file.py
@@ -6,7 +6,8 @@
 import time
 import torch
 import copy
-
+from utils.vram_utils import get_available_vram_gb, print_vram_info
+from utils.quantization import get_quantization_config, apply_selective_quantization
 from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
 from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
 from transformers.utils import logging
@@ -129,6 +130,13 @@ def parse_args():
         default=1.5,
         help="CFG (Classifier-Free Guidance) scale for generation (default: 1.5)",
     )
+    parser.add_argument(
+        "--quantization",
+        type=str,
+        default="fp16",
+        choices=["fp16", "8bit", "4bit"],
+        help="Quantization level: fp16 (default, ~20GB), 8bit (~12GB), or 4bit (~7GB)"
+    )
 
     return parser.parse_args()
 
@@ -146,6 +154,14 @@ def main():
             args.device = "cpu"
 
     print(f"Using device: {args.device}")
+
+    # VRAM Detection and Quantization Info (NEW)
+    if args.device == "cuda":
+        available_vram = get_available_vram_gb()
+        print_vram_info(available_vram, args.model_path, args.quantization)
+    elif args.quantization != "fp16":
+        print(f"Warning: Quantization ({args.quantization}) only works with CUDA. Using full precision.")
+        args.quantization = "fp16"
 
     # Initialize voice mapper
     voice_mapper = VoiceMapper()
@@ -164,7 +180,7 @@ def main():
         print("Error: No valid scripts found in the txt file")
         return
 
-    full_script = scripts.replace("’", "'").replace('“', '"').replace('”', '"')
+    full_script = scripts.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')
 
     print(f"Loading processor & model from {args.model_path}")
     processor = VibeVoiceStreamingProcessor.from_pretrained(args.model_path)
@@ -180,6 +196,15 @@ def main():
         load_dtype = torch.float32
         attn_impl_primary = "sdpa"
     print(f"Using device: {args.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}")
+
+    # Get quantization configuration (NEW)
+    quant_config = get_quantization_config(args.quantization)
+
+    if quant_config:
+        print(f"Using {args.quantization} quantization...")
+    else:
+        print("Using full precision (fp16)...")
+
     # Load model with device-specific logic
     try:
         if args.device == "mps":
@@ -191,12 +216,25 @@ def main():
             )
             model.to("mps")
         elif args.device == "cuda":
+            # MODIFIED SECTION - Add quantization support
+            model_kwargs = {
+                "torch_dtype": load_dtype,
+                "device_map": "cuda",
+                "attn_implementation": attn_impl_primary,
+            }
+
+            # Add quantization config if specified
+            if quant_config:
+                model_kwargs.update(quant_config)
+
             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
                 args.model_path,
-                torch_dtype=load_dtype,
-                device_map="cuda",
-                attn_implementation=attn_impl_primary,
+                **model_kwargs
             )
+
+            # Apply selective quantization if needed (NEW)
+            if args.quantization in ["8bit", "4bit"]:
+                model = apply_selective_quantization(model, args.quantization)
         else:  # cpu
             model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
                 args.model_path,
@@ -311,4 +349,4 @@ def main():
     print("="*50)
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/quantization,py b/utils/quantization,py
new file mode 100644
index 0000000..2348cf5
--- /dev/null
+++ b/utils/quantization,py
@@ -0,0 +1,113 @@
+"""Quantization utilities for VibeVoice models."""
+
+import logging
+from typing import Optional
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_quantization_config(quantization: str = "fp16") -> Optional[dict]:
+    """
+    Get quantization configuration for model loading.
+
+    Args:
+        quantization: Quantization level ("fp16", "8bit", or "4bit")
+
+    Returns:
+        dict: Quantization config for from_pretrained, or None for fp16
+    """
+    if quantization == "fp16" or quantization == "full":
+        return None
+
+    if quantization == "8bit":
+        try:
+            import bitsandbytes as bnb
+            from transformers import BitsAndBytesConfig
+            logger.info("Using 8-bit quantization (selective LLM only)")
+            return {
+                "quantization_config": BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)
+            }
+        except ImportError:
+            logger.error(
+                "8-bit quantization requires bitsandbytes. "
+                "Install with: pip install bitsandbytes"
+            )
+            raise
+
+    elif quantization == "4bit":
+        try:
+            import bitsandbytes as bnb
+            from transformers import BitsAndBytesConfig
+
+            logger.info("Using 4-bit NF4 quantization (selective LLM only)")
+            return {
+                "quantization_config": BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                )
+            }
+        except ImportError:
+            logger.error(
+                "4-bit quantization requires bitsandbytes. "
+                "Install with: pip install bitsandbytes"
+            )
+            raise
+
+    else:
+        raise ValueError(
+            f"Invalid quantization: {quantization}. "
+            f"Must be one of: fp16, 8bit, 4bit"
+        )
+
+
+def apply_selective_quantization(model, quantization: str):
+    """
+    Apply selective quantization only to safe components.
+
+    This function identifies which modules should be quantized and which
+    should remain at full precision for audio quality preservation.
+
+    Args:
+        model: The VibeVoice model
+        quantization: Quantization level ("8bit" or "4bit")
+    """
+    if quantization == "fp16":
+        return model
+
+    logger.info("Applying selective quantization...")
+
+    # Components to KEEP at full precision (audio-critical)
+    keep_fp_components = [
+        "diffusion_head",
+        "acoustic_connector",
+        "semantic_connector",
+        "acoustic_tokenizer",
+        "semantic_tokenizer",
+        "vae",
+    ]
+
+    # Only quantize the LLM (Qwen2.5) component
+    quantize_components = ["llm", "language_model"]
+
+    for name, module in model.named_modules():
+        # Check if this module should stay at full precision
+        should_keep_fp = any(comp in name for comp in keep_fp_components)
+        should_quantize = any(comp in name for comp in quantize_components)
+
+        if should_keep_fp:
+            # Keep audio-critical weights un-quantized (bfloat16 here, never int8/int4)
+            if hasattr(module, 'weight') and module.weight.dtype != torch.float32:
+                module.weight.data = module.weight.data.to(torch.bfloat16)
+                logger.debug(f"Keeping {name} un-quantized (audio-critical)")
+
+        elif should_quantize:
+            logger.debug(f"Quantized {name} to {quantization}")
+
+    logger.info(f"✓ Selective {quantization} quantization applied")
+    logger.info("  • LLM: Quantized")
+    logger.info("  • Audio components: Full precision")
+
+    return model
\ No newline at end of file
diff --git a/utils/vram_utils.py b/utils/vram_utils.py
new file mode 100644
index 0000000..813516d
--- /dev/null
+++ b/utils/vram_utils.py
@@ -0,0 +1,87 @@
+"""VRAM detection and quantization recommendation utilities."""
+
+import torch
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_available_vram_gb() -> float:
+    """
+    Get available VRAM in GB.
+
+    Returns:
+        float: Available VRAM in GB, or 0 if no CUDA device available
+    """
+    if not torch.cuda.is_available():
+        return 0.0
+
+    try:
+        # Get first CUDA device
+        device = torch.device("cuda:0")
+        # Get total and allocated memory
+        total = torch.cuda.get_device_properties(device).total_memory
+        allocated = torch.cuda.memory_allocated(device)
+        available = (total - allocated) / (1024 ** 3)  # Convert to GB
+        return available
+    except Exception as e:
+        logger.warning(f"Could not detect VRAM: {e}")
+        return 0.0
+
+
+def suggest_quantization(available_vram_gb: float, model_name: str = "VibeVoice-7B") -> str:
+    """
+    Suggest quantization level based on available VRAM.
+
+    Args:
+        available_vram_gb: Available VRAM in GB
+        model_name: Name of the model being loaded
+
+    Returns:
+        str: Suggested quantization level ("fp16", "8bit", or "4bit")
+    """
+    # VibeVoice-7B memory requirements (approximate)
+    # Full precision (fp16/bf16): ~20GB
+    # 8-bit quantization: ~12GB
+    # 4-bit quantization: ~7GB
+
+    if "1.5B" in model_name:
+        # 1.5B model is smaller, adjust thresholds
+        if available_vram_gb >= 8:
+            return "fp16"
+        elif available_vram_gb >= 6:
+            return "8bit"
+        else:
+            return "4bit"
+    else:
+        # Assume 7B model
+        if available_vram_gb >= 22:
+            return "fp16"
+        elif available_vram_gb >= 14:
+            return "8bit"
+        else:
+            return "4bit"
+
+
+def print_vram_info(available_vram_gb: float, model_name: str, quantization: str = "fp16"):
+    """
+    Print VRAM information and quantization recommendation.
+
+    Args:
+        available_vram_gb: Available VRAM in GB
+        model_name: Name of the model being loaded
+        quantization: Current quantization setting
+    """
+    logger.info(f"Available VRAM: {available_vram_gb:.1f}GB")
+
+    suggested = suggest_quantization(available_vram_gb, model_name)
+
+    if suggested != quantization and quantization == "fp16":
+        logger.warning(
+            f"⚠️ Low VRAM detected ({available_vram_gb:.1f}GB). "
+            f"Recommended: --quantization {suggested}"
+        )
+        logger.warning(
+            f"   Example: python demo/inference_from_file.py "
+            f"--model_path {model_name} --quantization {suggested} ..."
+        )
\ No newline at end of file
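Taken together, the helpers introduced in this patch compose as follows. A minimal loading sketch, assuming the patch is applied, a CUDA device is present, and bitsandbytes is installed; the model id is illustrative, and note that `suggest_quantization` falls back to the 7B thresholds for any name not containing "1.5B":

```python
import torch
from utils.quantization import get_quantization_config
from utils.vram_utils import get_available_vram_gb, suggest_quantization
from vibevoice.modular.modeling_vibevoice_streaming_inference import (
    VibeVoiceStreamingForConditionalGenerationInference,
)

model_path = "microsoft/VibeVoice-Realtime-0.5B"  # illustrative
level = suggest_quantization(get_available_vram_gb(), model_path)

model_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "cuda"}
quant_config = get_quantization_config(level)  # None for fp16
if quant_config:
    model_kwargs.update(quant_config)

model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
    model_path, **model_kwargs
)
```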
" + "Install with: pip install bitsandbytes" + ) + raise + + else: + raise ValueError( + f"Invalid quantization: {quantization}. " + f"Must be one of: fp16, 8bit, 4bit" + ) + + +def apply_selective_quantization(model, quantization: str): + """ + Apply selective quantization only to safe components. + + This function identifies which modules should be quantized and which + should remain at full precision for audio quality preservation. + + Args: + model: The VibeVoice model + quantization: Quantization level ("8bit" or "4bit") + """ + if quantization == "fp16": + return model + + logger.info("Applying selective quantization...") + + # Components to KEEP at full precision (audio-critical) + keep_fp_components = [ + "diffusion_head", + "acoustic_connector", + "semantic_connector", + "acoustic_tokenizer", + "semantic_tokenizer", + "vae", + ] + + # Only quantize the LLM (Qwen2.5) component + quantize_components = ["llm", "language_model"] + + for name, module in model.named_modules(): + # Check if this module should stay at full precision + should_keep_fp = any(comp in name for comp in keep_fp_components) + should_quantize = any(comp in name for comp in quantize_components) + + if should_keep_fp: + # Ensure audio components stay at full precision + if hasattr(module, 'weight') and module.weight.dtype != torch.float32: + module.weight.data = module.weight.data.to(torch.bfloat16) + logger.debug(f"Keeping {name} at full precision (audio-critical)") + + elif should_quantize: + logger.debug(f"Quantized {name} to {quantization}") + + logger.info(f"✓ Selective {quantization} quantization applied") + logger.info(" • LLM: Quantized") + logger.info(" • Audio components: Full precision") + + return model \ No newline at end of file diff --git a/utils/vram_utils.py b/utils/vram_utils.py new file mode 100644 index 0000000..813516d --- /dev/null +++ b/utils/vram_utils.py @@ -0,0 +1,87 @@ +"""VRAM detection and quantization recommendation utilities.""" + +import torch +import logging + +logger = logging.getLogger(__name__) + + +def get_available_vram_gb() -> float: + """ + Get available VRAM in GB. + + Returns: + float: Available VRAM in GB, or 0 if no CUDA device available + """ + if not torch.cuda.is_available(): + return 0.0 + + try: + # Get first CUDA device + device = torch.device("cuda:0") + # Get total and allocated memory + total = torch.cuda.get_device_properties(device).total_memory + allocated = torch.cuda.memory_allocated(device) + available = (total - allocated) / (1024 ** 3) # Convert to GB + return available + except Exception as e: + logger.warning(f"Could not detect VRAM: {e}") + return 0.0 + + +def suggest_quantization(available_vram_gb: float, model_name: str = "VibeVoice-7B") -> str: + """ + Suggest quantization level based on available VRAM. 
From e3e4d69764e3156169640864db30dc8dfa3c91e8 Mon Sep 17 00:00:00 2001
From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com>
Date: Wed, 10 Dec 2025 15:04:26 +0530
Subject: [PATCH 4/9] Delete utils/quantization,py

---
 utils/quantization,py | 113 ------------------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 utils/quantization,py

diff --git a/utils/quantization,py b/utils/quantization,py
deleted file mode 100644
index 2348cf5..0000000
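The thresholds hard-coded in `suggest_quantization` can be sanity-checked without a GPU; the expected values below follow directly from the cutoffs in `utils/vram_utils.py`:

```python
from utils.vram_utils import suggest_quantization

# 7B cutoffs: >=22 GB -> fp16, >=14 GB -> 8bit, else 4bit
assert suggest_quantization(24.0, "VibeVoice-7B") == "fp16"
assert suggest_quantization(16.0, "VibeVoice-7B") == "8bit"
assert suggest_quantization(10.0, "VibeVoice-7B") == "4bit"

# 1.5B cutoffs: >=8 GB -> fp16, >=6 GB -> 8bit, else 4bit
assert suggest_quantization(8.0, "VibeVoice-1.5B") == "fp16"
assert suggest_quantization(5.0, "VibeVoice-1.5B") == "4bit"
```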
" + f"Must be one of: fp16, 8bit, 4bit" + ) + + +def apply_selective_quantization(model, quantization: str): + """ + Apply selective quantization only to safe components. + + This function identifies which modules should be quantized and which + should remain at full precision for audio quality preservation. + + Args: + model: The VibeVoice model + quantization: Quantization level ("8bit" or "4bit") + """ + if quantization == "fp16": + return model + + logger.info("Applying selective quantization...") + + # Components to KEEP at full precision (audio-critical) + keep_fp_components = [ + "diffusion_head", + "acoustic_connector", + "semantic_connector", + "acoustic_tokenizer", + "semantic_tokenizer", + "vae", + ] + + # Only quantize the LLM (Qwen2.5) component + quantize_components = ["llm", "language_model"] + + for name, module in model.named_modules(): + # Check if this module should stay at full precision + should_keep_fp = any(comp in name for comp in keep_fp_components) + should_quantize = any(comp in name for comp in quantize_components) + + if should_keep_fp: + # Ensure audio components stay at full precision + if hasattr(module, 'weight') and module.weight.dtype != torch.float32: + module.weight.data = module.weight.data.to(torch.bfloat16) + logger.debug(f"Keeping {name} at full precision (audio-critical)") + + elif should_quantize: + logger.debug(f"Quantized {name} to {quantization}") + + logger.info(f"✓ Selective {quantization} quantization applied") + logger.info(" • LLM: Quantized") + logger.info(" • Audio components: Full precision") + + return model \ No newline at end of file From e3e4d69764e3156169640864db30dc8dfa3c91e8 Mon Sep 17 00:00:00 2001 From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com> Date: Wed, 10 Dec 2025 15:04:26 +0530 Subject: [PATCH 4/9] Delete utils/quantization,py --- utils/quantization,py | 113 ------------------------------------------ 1 file changed, 113 deletions(-) delete mode 100644 utils/quantization,py diff --git a/utils/quantization,py b/utils/quantization,py deleted file mode 100644 index 2348cf5..0000000 --- a/utils/quantization,py +++ /dev/null @@ -1,113 +0,0 @@ -"""Quantization utilities for VibeVoice models.""" - -import logging -from typing import Optional -import torch - -logger = logging.getLogger(__name__) - - -def get_quantization_config(quantization: str = "fp16") -> Optional[dict]: - """ - Get quantization configuration for model loading. - - Args: - quantization: Quantization level ("fp16", "8bit", or "4bit") - - Returns: - dict: Quantization config for from_pretrained, or None for fp16 - """ - if quantization == "fp16" or quantization == "full": - return None - - if quantization == "8bit": - try: - import bitsandbytes as bnb - logger.info("Using 8-bit quantization (selective LLM only)") - return { - "load_in_8bit": True, - "llm_int8_threshold": 6.0, - } - except ImportError: - logger.error( - "8-bit quantization requires bitsandbytes. " - "Install with: pip install bitsandbytes" - ) - raise - - elif quantization == "4bit": - try: - import bitsandbytes as bnb - from transformers import BitsAndBytesConfig - - logger.info("Using 4-bit NF4 quantization (selective LLM only)") - return { - "quantization_config": BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - ) - } - except ImportError: - logger.error( - "4-bit quantization requires bitsandbytes. 
" - "Install with: pip install bitsandbytes" - ) - raise - - else: - raise ValueError( - f"Invalid quantization: {quantization}. " - f"Must be one of: fp16, 8bit, 4bit" - ) - - -def apply_selective_quantization(model, quantization: str): - """ - Apply selective quantization only to safe components. - - This function identifies which modules should be quantized and which - should remain at full precision for audio quality preservation. - - Args: - model: The VibeVoice model - quantization: Quantization level ("8bit" or "4bit") - """ - if quantization == "fp16": - return model - - logger.info("Applying selective quantization...") - - # Components to KEEP at full precision (audio-critical) - keep_fp_components = [ - "diffusion_head", - "acoustic_connector", - "semantic_connector", - "acoustic_tokenizer", - "semantic_tokenizer", - "vae", - ] - - # Only quantize the LLM (Qwen2.5) component - quantize_components = ["llm", "language_model"] - - for name, module in model.named_modules(): - # Check if this module should stay at full precision - should_keep_fp = any(comp in name for comp in keep_fp_components) - should_quantize = any(comp in name for comp in quantize_components) - - if should_keep_fp: - # Ensure audio components stay at full precision - if hasattr(module, 'weight') and module.weight.dtype != torch.float32: - module.weight.data = module.weight.data.to(torch.bfloat16) - logger.debug(f"Keeping {name} at full precision (audio-critical)") - - elif should_quantize: - logger.debug(f"Quantized {name} to {quantization}") - - logger.info(f"✓ Selective {quantization} quantization applied") - logger.info(" • LLM: Quantized") - logger.info(" • Audio components: Full precision") - - return model \ No newline at end of file From cdde460df84596fee847ca5651c2ee0d6eaf9bc1 Mon Sep 17 00:00:00 2001 From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com> Date: Wed, 10 Dec 2025 15:14:11 +0530 Subject: [PATCH 5/9] Update realtime_model_inference_from_file.py --- demo/realtime_model_inference_from_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/realtime_model_inference_from_file.py b/demo/realtime_model_inference_from_file.py index 5afc098..023fbb9 100644 --- a/demo/realtime_model_inference_from_file.py +++ b/demo/realtime_model_inference_from_file.py @@ -349,4 +349,4 @@ def main(): print("="*50) if __name__ == "__main__": - main() \ No newline at end of file + main() From 62565c453a252d021af4de7527d3f9a6a4420be7 Mon Sep 17 00:00:00 2001 From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:52:02 +0530 Subject: [PATCH 6/9] Delete TROUBLESHOOTING.md --- TROUBLESHOOTING.md | 1720 -------------------------------------------- 1 file changed, 1720 deletions(-) delete mode 100644 TROUBLESHOOTING.md diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md deleted file mode 100644 index d26484f..0000000 --- a/TROUBLESHOOTING.md +++ /dev/null @@ -1,1720 +0,0 @@ -\# VibeVoice Comprehensive Troubleshooting Guide - - - -A complete reference for resolving common issues when installing and running Microsoft's VibeVoice text-to-speech models. - - - ---- - - - -\## Table of Contents - -1\. \[Installation Issues](#installation-issues) - -2\. \[Hardware and Memory Problems](#hardware-and-memory-problems) - -3\. \[Model Loading Errors](#model-loading-errors) - -4\. \[Audio Quality and Generation Issues](#audio-quality-and-generation-issues) - -5\. \[Platform-Specific Issues](#platform-specific-issues) - -6\. 
\[Performance Optimization](#performance-optimization) - -7\. \[Known Limitations](#known-limitations) - - - ---- - - - -\## Installation Issues - - - -\### Issue 1: Flash Attention Installation Fails on Windows - - - -\*\*Problem\*\*: Building Flash Attention from source fails with compilation errors on Windows systems. - - - -\*\*Symptoms\*\*: - -\- ImportError: FlashAttention2 package not found - -\- Compilation errors during pip install - -\- Visual Studio build errors - - - -\*\*Solutions\*\*: - - - -\*\*Option A: Use Pre-built Wheels (Recommended for Windows)\*\* - -```bash - -\# Download pre-built Flash Attention wheels from: - -\# https://github.com/kingbri1/flash-attention/releases - - - -\# Install the wheel matching your configuration: - -\# - Python version (3.10, 3.11, etc.) - -\# - CUDA version (12.1, 12.4, etc.) - -\# - PyTorch version - - - -pip install flash\_attn-2.x.x-cpxxx-cpxxx-win\_amd64.whl - -``` - - - -\*\*Option B: Run Without Flash Attention\*\* - -```bash - -\# Modify the model loading to use eager attention instead - -\# Add this flag when running inference: - ---attn\_implementation eager - -``` - - - -\*\*Option C: Use Docker (Most Reliable)\*\* - -```bash - -\# Pull NVIDIA PyTorch container - -docker pull nvcr.io/nvidia/pytorch:24.08-py3 - - - -\# Run VibeVoice inside container - -docker run --gpus all -it nvcr.io/nvidia/pytorch:24.08-py3 - -``` - - - -\### Issue 2: Transformers Version Incompatibility - - - -\*\*Problem\*\*: Error message "Transformers does not recognize `vibevoice` architecture" - - - -\*\*Symptoms\*\*: - -``` - -ValueError: The checkpoint you are trying to load has model type `vibevoice` - -but Transformers does not recognize this architecture. - -``` - - - -\*\*Solutions\*\*: - - - -\*\*Step 1: Update Transformers\*\* - -```bash - -\# Install the latest development version - -pip install git+https://github.com/huggingface/transformers.git - - - -\# Or install from specific PR if needed - -pip install git+https://github.com/huggingface/transformers.git@refs/pull/40546/head - -``` - - - -\*\*Step 2: Verify Installation\*\* - -```python - -import transformers - -print(transformers.\_\_version\_\_) # Should be 4.51.3 or higher - -``` - - - -\*\*Step 3: Clear Cache\*\* - -```bash - -\# Clear HuggingFace cache if issues persist - -rm -rf ~/.cache/huggingface/hub/models--microsoft--VibeVoice\* - -``` - - - -\### Issue 3: CUDA and PyTorch Version Mismatch - - - -\*\*Problem\*\*: CUDA errors or "Torch not compiled with CUDA enabled" - - - -\*\*Symptoms\*\*: - -\- AssertionError: Torch not compiled with CUDA enabled - -\- CUDA runtime errors - -\- GPU not detected - - - -\*\*Solution\*\*: - -```bash - -\# Check your CUDA version - -nvidia-smi - - - -\# Install matching PyTorch (example for CUDA 12.1) - -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 - - - -\# Verify CUDA is available in PyTorch - -python -c "import torch; print(torch.cuda.is\_available())" - -``` - - - -\*\*Requirements\*\*: - -\- CUDA 12.x or higher - -\- cuBLAS and cuDNN 9.x - -\- Matching PyTorch version - - - ---- - - - -\## Hardware and Memory Problems - - - -\### Issue 4: CUDA Out of Memory (OOM) - - - -\*\*Problem\*\*: GPU runs out of VRAM during model loading or inference - - - -\*\*Symptoms\*\*: - -``` - -RuntimeError: CUDA out of memory. 
Tried to allocate X.XX GiB - -``` - - - -\*\*VRAM Requirements\*\*: - -\- \*\*VibeVoice-1.5B\*\*: ~7 GB VRAM (works on RTX 3060 8GB) - -\- \*\*VibeVoice-7B FP16\*\*: ~19-24 GB VRAM (RTX 3090/4090 or A5000+) - -\- \*\*VibeVoice-7B 8-bit\*\*: ~12 GB VRAM - -\- \*\*VibeVoice-7B 4-bit\*\*: ~6-8 GB VRAM - -\- \*\*VibeVoice-Realtime-0.5B\*\*: ~4 GB VRAM - - - -\*\*Solutions by GPU Size\*\*: - - - -\*\*For 8GB GPUs (RTX 3060, 3070, etc.)\*\*: - -```bash - -\# Option 1: Use 1.5B model - -python demo/inference\_from\_file.py \\ - -  --model\_path vibevoice/VibeVoice-1.5B \\ - -  --txt\_path your\_text.txt \\ - -  --speaker\_names Alice Bob - - - -\# Option 2: Use 7B with 4-bit quantization - -python demo/inference\_from\_file.py \\ - -  --model\_path Dannidee/VibeVoice7b-low-vram \\ - -  --txt\_path your\_text.txt \\ - -  --speaker\_names Alice Bob - -``` - - - -\*\*For 12GB GPUs (RTX 3060 Ti, 4070, etc.)\*\*: - -```bash - -\# Use 8-bit quantization - -python demo/inference\_from\_file.py \\ - -  --model\_path vibevoice/VibeVoice-7B \\ - -  --load\_in\_8bit \\ - -  --txt\_path your\_text.txt \\ - -  --speaker\_names Alice Bob - -``` - - - -\*\*For 24GB+ GPUs (RTX 3090, 4090, A5000+)\*\*: - -```bash - -\# Use full precision 7B model - -python demo/inference\_from\_file.py \\ - -  --model\_path vibevoice/VibeVoice-7B \\ - -  --txt\_path your\_text.txt \\ - -  --speaker\_names Alice Bob - -``` - - - -\### Issue 5: Quantization Setup - - - -\*\*Problem\*\*: Need to reduce VRAM usage through quantization - - - -\*\*4-bit Quantization (Best for Low VRAM)\*\*: - -```bash - -\# Install bitsandbytes - -pip install bitsandbytes - - - -\# Option 1: Use pre-quantized model - -git clone https://huggingface.co/Dannidee/VibeVoice7b-low-vram - - - -\# Option 2: Quantize your own model - -python quantize\_and\_save\_vibevoice.py \\ - -  --model\_path /path/to/original/model \\ - -  --output\_dir /path/to/output/4bit \\ - -  --bits 4 \\ - -  --test - -``` - - - -\*\*8-bit Quantization (Better Quality)\*\*: - -```python - -from transformers import BitsAndBytesConfig - - - -quantization\_config = BitsAndBytesConfig( - -  load\_in\_8bit=True, - -  llm\_int8\_threshold=6.0, - -) - - - -\# Load model with config - -model = VibeVoiceForConditionalGeneration.from\_pretrained( - -  "vibevoice/VibeVoice-7B", - -  quantization\_config=quantization\_config, - -  device\_map="auto" - -) - -``` - - - -\*\*Important Notes\*\*: - -\- 4-bit quantization: ~6.6GB VRAM, minimal quality loss - -\- 8-bit quantization: ~12GB VRAM, better quality - -\- Requires CUDA GPU and bitsandbytes library - -\- Audio quality remains excellent with proper quantization - - - -\### Issue 6: CPU-Only Usage - - - -\*\*Problem\*\*: Trying to run VibeVoice without a GPU - - - -\*\*Reality Check\*\*: - -\- CPU inference is \*\*extremely slow\*\* and not practical - -\- Model requires CUDA-enabled GPU for reasonable performance - -\- Even with 32GB RAM, CPU mode is not recommended - - - -\*\*Workaround (Not Recommended)\*\*: - -```bash - -\# Force CPU mode (will be very slow) - -python demo/inference\_from\_file.py \\ - -  --model\_path vibevoice/VibeVoice-1.5B \\ - -  --device cpu \\ - -  --attn\_implementation eager \\ - -  --txt\_path your\_text.txt \\ - -  --speaker\_names Alice - -``` - - - -\*\*Better Alternative\*\*: - -\- Use Google Colab with free GPU - -\- Use cloud GPU services (AWS, GCP, RunPod) - -\- Access community-hosted demos online - - - ---- - - - -\## Model Loading Errors - - - -\### Issue 7: "Failed to load VibeVoice processor" Error - - 
- -\*\*Problem\*\*: Model fails to load with NoneType or path errors - - - -\*\*Symptoms\*\*: - -``` - -Error: expected str, bytes or os.PathLike object, not NoneType - -Please ensure transformers>=4.51.3 is installed - -``` - - - -\*\*Solutions\*\*: - - - -\*\*Step 1: Verify Model Path\*\* - -```bash - -\# Check if model files exist - -ls -la ~/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/ - - - -\# Or for local models - -ls -la /path/to/your/model/ - -``` - - - -\*\*Step 2: Reinstall Transformers\*\* - -```bash - -pip uninstall transformers -y - -pip install transformers>=4.51.3 - -``` - - - -\*\*Step 3: Clear and Re-download\*\* - -```python - -from huggingface\_hub import snapshot\_download - - - -\# Download fresh copy - -snapshot\_download( - -  repo\_id="microsoft/VibeVoice-1.5B", - -  local\_dir="./models/VibeVoice-1.5B", - -  force\_download=True - -) - -``` - - - -\### Issue 8: Invalid Device String Error - - - -\*\*Problem\*\*: Error "Invalid device string: '0'" - - - -\*\*Symptoms\*\*: - -``` - -RuntimeError: Invalid device string: '0' - -Failed to load model even with eager attention - -``` - - - -\*\*Solution\*\*: - -```python - -\# Change device specification from '0' to 'cuda:0' or 'cuda' - - - -\# Wrong: - -device = '0' - - - -\# Correct: - -device = 'cuda:0' - -\# or - -device = 'cuda' - -\# or for CPU - -device = 'cpu' - -``` - - - -\*\*In command line\*\*: - -```bash - -\# Instead of --device 0 - -python inference.py --device cuda:0 - - - -\# Or let it auto-detect - -python inference.py --device cuda - -``` - - - -\### Issue 9: DynamicCache Memory Issues - - - -\*\*Problem\*\*: Long sequences cause cache-related crashes - - - -\*\*Symptoms\*\*: - -``` - -RuntimeError: CUDA out of memory during cache update - -DynamicCache overflow - -``` - - - -\*\*Solution - Modify Code\*\*: - - - -Edit `modeling\_vibevoice\_inference.py` around line 518: - - - -```python - -\# Add try-except block to handle cache updates - -try: - -  for layer\_idx, (k\_cache, v\_cache) in enumerate( - -  zip( - -  negative\_model\_kwargs\['past\_key\_values'].key\_cache, - -  negative\_model\_kwargs\['past\_key\_values'].value\_cache - -  ) - -  ): - -  for sample\_idx in diffusion\_start\_indices.tolist(): - -  k\_cache\[sample\_idx, :, -1, :] = k\_cache\[sample\_idx, :, 0, :].clone() - -  v\_cache\[sample\_idx, :, -1, :] = v\_cache\[sample\_idx, :, 0, :].clone() - -except Exception as e: - -  print(f"Cache update failed: {e}") - -  negative\_model\_kwargs\['past\_key\_values'] = None - -``` - - - -\*\*Alternative\*\*: - -```python - -\# Disable caching for long sequences - -model.generation\_config.use\_cache = False - -``` - - - ---- - - - -\## Audio Quality and Generation Issues - - - -\### Issue 10: Poor Audio Quality or Robotic Voice - - - -\*\*Problem\*\*: Generated speech sounds distorted, robotic, or unnatural - - - -\*\*Causes and Solutions\*\*: - - - -\*\*1. Wrong Language Input\*\* - -```bash - -\# VibeVoice-1.5B/7B: English and Chinese only - -\# Other languages produce poor results - - - -\# Bad: - -text = "Bonjour, comment allez-vous?" # French not supported - - - -\# Good: - -text = "Hello, how are you?" # English - -text = "你好,你好吗?" # Chinese - -``` - - - -\*\*2. Improper Text Formatting\*\* - -```python - -\# Bad - No speaker labels - -text = "Hello. How are you? I'm fine." - - - -\# Good - Clear speaker labels - -text = """ - -Speaker 1: Hello, how are you? - -Speaker 2: I'm fine, thank you! How about you? - -Speaker 1: I'm doing great! 
- -""" - - - -\# Alternative format - -text = """ - -\[1] Hello, how are you? - -\[2] I'm fine, thank you! How about you? - -\[1] I'm doing great! - -""" - -``` - - - -\*\*3. Text Too Long (Acceleration Issues)\*\* - -```python - -\# Problem: Texts over 250 words cause audio to speed up - - - -\# Solution: Split into chunks - -def split\_text\_into\_chunks(text, max\_words=250): - -  words = text.split() - -  chunks = \[] - -  for i in range(0, len(words), max\_words): - -  chunk = ' '.join(words\[i:i+max\_words]) - -  chunks.append(chunk) - -  return chunks - - - -\# Process each chunk separately - -for chunk in chunks: - -  audio = generate\_speech(chunk) - -  concatenated\_audio.append(audio) - -``` - - - -\*\*4. Wrong Sample Rate\*\* - -```python - -\# VibeVoice expects 24kHz - -\# Ensure reference audio matches - -import librosa - - - -audio, sr = librosa.load('reference.wav', sr=24000) - -``` - - - -\### Issue 11: Background Music and Artifacts - - - -\*\*Problem\*\*: Unwanted background music, chimes, or sound effects appear - - - -\*\*Explanation\*\*: - -\- This is a known behavior, especially in 1.5B model - -\- Model sometimes adds background music or effects - -\- More common in longer generations (45-90 minutes) - - - -\*\*Mitigation\*\*: - -```bash - -\# Use specific speakers that don't include background music - -\# Avoid speakers with '\_bgm' suffix unless you want music - - - -\# Without background music: - ---speaker\_names Alice Frank Carter - - - -\# With background music (intentional): - ---speaker\_names Mary\_woman\_bgm Anchen\_man\_bgm - -``` - - - -\*\*Post-Processing\*\*: - -```python - -\# Use audio editing to remove unwanted sounds - -from pydub import AudioSegment - -from pydub.effects import high\_pass\_filter, low\_pass\_filter - - - -audio = AudioSegment.from\_wav("output.wav") - -\# Apply filters to reduce artifacts - -filtered = high\_pass\_filter(audio, cutoff=80) - -filtered = low\_pass\_filter(filtered, cutoff=8000) - -filtered.export("cleaned.wav", format="wav") - -``` - - - -\### Issue 12: Inconsistent Speaker Voices - - - -\*\*Problem\*\*: Speaker voice changes mid-generation or speakers sound too similar - - - -\*\*Causes\*\*: - -1\. Insufficient voice reference samples - -2\. Similar speaker names confusing the model - -3\. Very long sequences (45+ minutes) - - - -\*\*Solutions\*\*: - - - -\*\*1. Use Distinct Voice References\*\* - -```python - -\# Provide clear, different reference samples - -speaker\_voices = { - -  "Alice": "demo/voices/en-Alice\_woman.wav", - -  "Frank": "demo/voices/en-Frank\_man.wav", - -  "Carter": "demo/voices/en-Carter\_man.wav", - -} - -``` - - - -\*\*2. Use Clear, Distinct Speaker Names\*\* - -```bash - -\# Bad - Similar names - ---speaker\_names John Jon Jean - - - -\# Good - Distinct names - ---speaker\_names Alice Bob Carol Dave - -``` - - - -\*\*3. 
Keep Generation Length Reasonable\*\* - -\- 1.5B model: Best quality under 30 minutes - -\- 7B model: Better quality for 45-90 minute content - -\- Split very long content into segments - - - -\### Issue 13: No Overlapping Speech Support - - - -\*\*Problem\*\*: Cannot generate two speakers talking simultaneously - - - -\*\*Reality\*\*: - -\- This is a \*\*known limitation\*\* of current VibeVoice models - -\- Model generates turn-based dialogue only - -\- No interruptions or overlapping speech - - - -\*\*Workaround\*\*: - -\- Design dialogue with clear turn-taking - -\- Use post-processing to manually mix audio if needed - -\- Wait for future model updates - - - ---- - - - -\## Platform-Specific Issues - - - -\### Issue 14: Windows-Specific Problems - - - -\*\*Path Issues\*\*: - -```python - -\# Use raw strings or forward slashes - -\# Bad: - -path = "C:\\Users\\name\\VibeVoice\\model" - - - -\# Good: - -path = r"C:\\Users\\name\\VibeVoice\\model" - -\# or - -path = "C:/Users/name/VibeVoice/model" - -``` - - - -\*\*Visual Studio Build Tools\*\*: - -```bash - -\# Required for building certain packages - -\# Download from: https://visualstudio.microsoft.com/downloads/ - -\# Install "Desktop development with C++" - -``` - - - -\*\*Line Ending Issues\*\*: - -```bash - -\# Convert Linux line endings to Windows if needed - -git config --global core.autocrlf true - -``` - - - -\### Issue 15: macOS Apple Silicon (M1/M2/M3) - - - -\*\*Problem\*\*: Running on Mac with Apple Silicon GPUs - - - -\*\*Solution - Use MPS Backend\*\*: - -```python - -import torch - - - -\# Check MPS availability - -if torch.backends.mps.is\_available(): - -  device = "mps" - -else: - -  device = "cpu" - - - -\# Load model to MPS - -model = model.to(device) - -``` - - - -\*\*Performance Notes\*\*: - -\- MPS backend is significantly faster than CPU - -\- Still slower than NVIDIA GPUs - -\- Best for 1.5B model, 7B may be slow - - - -\*\*ComfyUI Integration\*\*: - -\- Recent ComfyUI-VibeVoice versions auto-detect MPS - -\- Provides good performance improvements over CPU - - - -\### Issue 16: Linux Permission Issues - - - -\*\*Problem\*\*: Permission denied when accessing model files - - - -\*\*Solution\*\*: - -```bash - -\# Fix permissions - -chmod -R 755 ~/.cache/huggingface/ - -chmod -R 755 ~/VibeVoice/ - - - -\# If using Docker, ensure proper volume mounting - -docker run --gpus all \\ - -  -v ~/.cache/huggingface:/root/.cache/huggingface \\ - -  -v $(pwd):/workspace \\ - -  -it pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime - -``` - - - ---- - - - -\## Performance Optimization - - - -\### Issue 17: Slow Generation Speed - - - -\*\*Problem\*\*: Inference takes too long - - - -\*\*Optimization Strategies\*\*: - - - -\*\*1. Enable Flash Attention\*\* - -```python - -\# Fastest attention mechanism - -model = AutoModelForConditionalGeneration.from\_pretrained( - -  "vibevoice/VibeVoice-1.5B", - -  attn\_implementation="flash\_attention\_2", - -  torch\_dtype=torch.bfloat16, - -) - -``` - - - -\*\*2. Use Optimal Diffusion Steps\*\* - -```bash - -\# Default is 20, can reduce for speed - -\# Quality vs Speed trade-off: - ---diffusion\_steps 10 # Fast, lower quality - ---diffusion\_steps 20 # Balanced (recommended) - ---diffusion\_steps 50 # Slow, highest quality - -``` - - - -\*\*3. Batch Processing\*\* - -```python - -\# Process multiple short texts efficiently - -\# Instead of one 10-minute generation - -\# Do five 2-minute generations in parallel - -``` - - - -\*\*4. 
---

## Platform-Specific Issues

### Issue 14: Windows-Specific Problems

**Path Issues**:
```python
# Use raw strings or forward slashes
# Bad - backslash escapes are misinterpreted:
path = "C:\Users\name\VibeVoice\model"

# Good:
path = r"C:\Users\name\VibeVoice\model"
# or
path = "C:/Users/name/VibeVoice/model"
```

**Visual Studio Build Tools**:
```bash
# Required for building certain packages
# Download from: https://visualstudio.microsoft.com/downloads/
# Install "Desktop development with C++"
```

**Line Ending Issues**:
```bash
# Normalize line endings between Linux and Windows if needed
git config --global core.autocrlf true
```

### Issue 15: macOS Apple Silicon (M1/M2/M3)

**Problem**: Running on a Mac with an Apple Silicon GPU

**Solution - Use the MPS Backend**:
```python
import torch

# Check MPS availability and fall back to CPU
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Move the model to the selected device
model = model.to(device)
```

**Performance Notes**:
- The MPS backend is significantly faster than CPU
- It is still slower than NVIDIA GPUs
- Best suited to the 1.5B model; the 7B model may be slow

**ComfyUI Integration**:
- Recent ComfyUI-VibeVoice versions auto-detect MPS
- This provides a good performance improvement over CPU

### Issue 16: Linux Permission Issues

**Problem**: Permission denied when accessing model files

**Solution**:
```bash
# Fix permissions
chmod -R 755 ~/.cache/huggingface/
chmod -R 755 ~/VibeVoice/

# If using Docker, ensure proper volume mounting
docker run --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v $(pwd):/workspace \
  -it pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
```

---

## Performance Optimization

### Issue 17: Slow Generation Speed

**Problem**: Inference takes too long

**Optimization Strategies**:

**1. Enable Flash Attention**
```python
# Fastest attention mechanism (class and import path as used in the
# community fork's vibevoice package)
import torch
from vibevoice.modular.modeling_vibevoice_inference import (
    VibeVoiceForConditionalGenerationInference,
)

model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    "vibevoice/VibeVoice-1.5B",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
```

**2. Use Optimal Diffusion Steps**
```bash
# Default is 20; reduce for speed
# Quality vs speed trade-off:
--diffusion_steps 10   # Fast, lower quality
--diffusion_steps 20   # Balanced (recommended)
--diffusion_steps 50   # Slow, highest quality
```

**3. Batch Processing**
```python
# Process multiple short texts instead of one long one:
# rather than a single 10-minute generation, run five 2-minute
# generations back to back (see the sketch after this list)
```

**4. Use Appropriate Model Size**
```bash
# For simple, short generations
--model_path microsoft/VibeVoice-Realtime-0.5B

# For quality multi-speaker audio
--model_path vibevoice/VibeVoice-1.5B

# For the best long-form quality
--model_path vibevoice/VibeVoice-7B
```
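A minimal sketch of the batch-processing idea, assuming hypothetical `generate_speech()` and `save_wav()` helpers wrapping your inference and output code (neither is part of the official demo scripts):

```python
# Sketch: run several short scripts back to back on one GPU.
# generate_speech() and save_wav() are hypothetical helpers.
import pathlib
import torch

for i, script in enumerate(sorted(pathlib.Path("scripts").glob("*.txt"))):
    text = script.read_text(encoding="utf-8")
    audio = generate_speech(text)
    save_wav(f"out/part_{i:03d}.wav", audio)
    torch.cuda.empty_cache()  # keep VRAM flat between runs
```

Shorter generations also fail faster, so a bad chunk costs minutes rather than the whole run.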
### Issue 18: High VRAM Usage During Long Generations

**Problem**: VRAM usage grows over time in long sequences

**Solutions**:

**1. Clear Cache Periodically**
```python
import torch

# Clear the CUDA cache between generations
torch.cuda.empty_cache()
```

**2. Use Gradient Checkpointing (fine-tuning only)**
```python
# Saves activation memory during training; it has no effect on
# pure inference
model.gradient_checkpointing_enable()
```

**3. Enable Memory Cleanup**
```python
# In ComfyUI-VibeVoice
free_memory_after_generate = True
```

---

## Known Limitations

### Current Model Constraints

**Language Support**:
- **VibeVoice-1.5B/7B**: English and Chinese only
- **VibeVoice-Realtime-0.5B**: English only
- Experimental speakers (DE, FR, IT, JP, KR, NL, PL, PT, ES): quality varies

**Audio Limitations**:
- No background noise generation
- No music generation (except via the BGM speakers)
- No sound effects
- No overlapping/simultaneous speech

**Technical Constraints**:
- Maximum of 4 speakers in multi-speaker mode
- Realtime model: single speaker only
- 1.5B: best under 30 minutes
- 7B: best under 90 minutes

**Quality Issues**:
- Voice consistency may degrade in very long sequences
- Random background music or effects may appear
- The 1.5B model produces more artifacts than the 7B model

### Repository Status

**Important**: As of September 2025, the official Microsoft repository was temporarily disabled due to misuse concerns. It has since been restored, but without the executable code. Use the community forks for active development:

- Community Fork: https://github.com/vibevoice-community/VibeVoice
- Model Hub: https://huggingface.co/collections/microsoft/vibevoice

---

## Getting Additional Help

### Community Resources

**Discord**: Unofficial VibeVoice Discord server for real-time help

**GitHub Issues**:
- Microsoft: https://github.com/microsoft/VibeVoice/issues
- Community: https://github.com/vibevoice-community/VibeVoice/issues

**HuggingFace Discussions**: https://huggingface.co/microsoft/VibeVoice-1.5B/discussions

### Reporting Issues

When reporting problems, include:
1. VibeVoice model version (1.5B, 7B, Realtime)
2. Operating system and version
3. GPU model and VRAM
4. Python version
5. Transformers version
6. Complete error traceback
7. Minimal reproduction code

**Example Issue Report**:
```
Environment:
- Model: VibeVoice-7B
- OS: Windows 11
- GPU: RTX 3060 Ti (8GB)
- Python: 3.10.12
- Transformers: 4.51.3
- CUDA: 12.1

Problem:
CUDA out of memory error during 20-minute generation

Error:
RuntimeError: CUDA out of memory. Tried to allocate 2.5 GiB

Steps to reproduce:
1. Load model with default settings
2. Process 5000-word text file
3. Error occurs at ~15 minutes

Expected behavior:
Should complete 20-minute generation
```

---

## Quick Reference: Common Commands

### Basic Inference
```bash
# Single speaker
python demo/inference_from_file.py \
  --model_path vibevoice/VibeVoice-1.5B \
  --txt_path input.txt \
  --speaker_names Alice

# Multi-speaker
python demo/inference_from_file.py \
  --model_path vibevoice/VibeVoice-7B \
  --txt_path dialogue.txt \
  --speaker_names Alice Bob Carol
```

### Memory-Optimized
```bash
# 4-bit quantization
python demo/inference_from_file.py \
  --model_path Dannidee/VibeVoice7b-low-vram \
  --txt_path input.txt \
  --speaker_names Alice Bob

# 8-bit with eager attention
python demo/inference_from_file.py \
  --model_path vibevoice/VibeVoice-7B \
  --load_in_8bit \
  --attn_implementation eager \
  --txt_path input.txt \
  --speaker_names Alice
```

### Gradio Demo
```bash
# Launch the web interface
python demo/gradio_demo.py \
  --model_path vibevoice/VibeVoice-1.5B \
  --device cuda \
  --share
```

### Realtime Streaming
```bash
# Streaming TTS
python demo/streaming_inference_from_file.py \
  --model_path microsoft/VibeVoice-Realtime-0.5B \
  --txt_path input.txt \
  --speaker_name Emma
```

---

## Maintenance and Updates

### Keeping the System Updated

```bash
# Update core packages
pip install --upgrade transformers accelerate torch

# Update the VibeVoice code (community fork)
cd VibeVoice
git pull origin main

# Clear old cached models
rm -rf ~/.cache/huggingface/hub/models--microsoft--VibeVoice*

# Re-download fresh models
python -c "from huggingface_hub import snapshot_download; snapshot_download('microsoft/VibeVoice-1.5B')"
```

### Monitoring Performance

```bash
# Watch GPU usage in real time
watch -n 0.5 nvidia-smi

# Profile memory usage
python -m memory_profiler your_script.py
```

```python
# Check CUDA memory from Python
import torch

print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached:    {torch.cuda.memory_reserved() / 1e9:.2f} GB")
```
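To see how close a given generation comes to your VRAM ceiling, here is a small sketch using PyTorch's built-in peak-memory counters:

```python
# Sketch: measure peak VRAM across one generation pass
import torch

torch.cuda.reset_peak_memory_stats()

# ... run one generation here ...

peak_gb = torch.cuda.max_memory_allocated() / 1e9
print(f"Peak allocated: {peak_gb:.2f} GB")
```

If the peak sits near your card's total VRAM, drop down one quantization level before attempting longer texts.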
---

## Final Notes

This guide covers the most common issues encountered when using VibeVoice. The project is under active development, and solutions may change as new versions are released. Always check the official documentation and community forums for the latest information.

**Remember**:
- Start with the 1.5B model before attempting 7B
- Use quantization if VRAM-constrained
- Keep text under 250 words per chunk
- Stick to English/Chinese for best results
- Test on short samples before long generations

**For commercial use**: VibeVoice is research-only. Do not use it in production without extensive testing and appropriate safeguards against misuse.

---

## Changelog

**Version 1.0** (December 2025)
- Initial comprehensive troubleshooting guide
- Covers installation, hardware, loading, quality, and platform issues
- Includes quantization strategies and performance optimization

---

*Document compiled from community experience, GitHub issues, and official documentation. Contributions welcome via pull request.*

From 276ad0995430a0262ef0766c2b4b2a00444100a5 Mon Sep 17 00:00:00 2001
From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com>
Date: Wed, 10 Dec 2025 20:36:30 +0530
Subject: [PATCH 7/9] Update realtime_model_inference_from_file.py

From 15ca0ac34d60b6f3071fa489697f52c3cbdc9a48 Mon Sep 17 00:00:00 2001
From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com>
Date: Wed, 10 Dec 2025 20:41:19 +0530
Subject: [PATCH 8/9] Update realtime_model_inference_from_file.py

---
 demo/realtime_model_inference_from_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/realtime_model_inference_from_file.py b/demo/realtime_model_inference_from_file.py
index 023fbb9..3e2b804 100644
--- a/demo/realtime_model_inference_from_file.py
+++ b/demo/realtime_model_inference_from_file.py
@@ -180,7 +180,7 @@ def main():
         print("Error: No valid scripts found in the txt file")
         return
 
-    full_script = scripts.replace("'", "'").replace('"', '"').replace('"', '"')
+    full_script = scripts.replace("’", "'").replace('“', '"').replace('”', '"')
 
     print(f"Loading processor & model from {args.model_path}")
     processor = VibeVoiceStreamingProcessor.from_pretrained(args.model_path)

From c2a5bbfdd86265460ca0de0a4930cb5f61fe11e8 Mon Sep 17 00:00:00 2001
From: Maitri Savaliya <122001787+maitrisavaliya@users.noreply.github.com>
Date: Wed, 10 Dec 2025 22:02:42 +0530
Subject: [PATCH 9/9] Update vram_utils.py

---
 utils/vram_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/vram_utils.py b/utils/vram_utils.py
index 813516d..7fedd05 100644
--- a/utils/vram_utils.py
+++ b/utils/vram_utils.py
@@ -22,7 +22,7 @@ def get_available_vram_gb() -> float:
         # Get total and allocated memory
         total = torch.cuda.get_device_properties(device).total_memory
         allocated = torch.cuda.memory_allocated(device)
-        available = (total - allocated) / (1024 ** 3)  # Convert to GB
+        available = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
         return available
     except Exception as e:
         logger.warning(f"Could not detect VRAM: {e}")
@@ -84,4 +84,4 @@ def print_vram_info(available_vram_gb: float, model_name: str, quantization: str
         logger.warning(
             f" Example: python demo/inference_from_file.py "
             f"--model_path {model_name} --quantization {suggested} ..."
-        )
\ No newline at end of file
+        )