#!/usr/bin/env python3
"""
VibeVoice Voice Listing Utility

This script lists all available voice presets for VibeVoice models,
including metadata like language and gender parsed from filenames.

Usage:
    python demo/list_voices.py [--format {table|json|simple}] [--lang LANG]
                               [--voices-dir DIR] [--stats]

Examples:
    # List all voices in table format (default)
    python demo/list_voices.py

    # List only English voices
    python demo/list_voices.py --lang en

    # Output as JSON for programmatic use
    python demo/list_voices.py --format json

    # Simple list of voice names only
    python demo/list_voices.py --format simple

    # Per-language / per-gender counts (honours --lang)
    python demo/list_voices.py --stats
"""

import argparse
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional


class VoiceInfo:
    """Metadata about one voice preset, parsed from its file stem.

    Stems are expected to look like ``{lang}-{SpeakerName}_{gender}``
    (for example ``en-Carter_man`` or ``de-Spk0_woman``).  Parts that cannot
    be parsed are reported as ``'unknown'``.

    Attributes set by ``__init__``: ``filename``, ``path``, ``name``,
    ``language``, ``speaker_name``, ``gender`` and ``size_mb``.
    """

    # Language-code -> display-name table used by get_language_name().
    # Built once at class creation instead of on every call.
    _LANGUAGE_NAMES = {
        'en': 'English',
        'de': 'German',
        'fr': 'French',
        'it': 'Italian',
        'jp': 'Japanese',
        'kr': 'Korean',
        'nl': 'Dutch',
        'pl': 'Polish',
        'pt': 'Portuguese',
        'sp': 'Spanish',
        'in': 'International',
    }

    def __init__(self, filename: str, path: Path):
        """Parse *filename* (a file stem without extension) located at *path*."""
        self.filename = filename
        self.path = path
        self.name = filename  # Full name without extension

        # Split on the FIRST '-' only, so hyphens inside the speaker part
        # stay with the speaker.
        language, dash, speaker_part = filename.partition('-')
        if dash:
            self.language = language
            # Split on the LAST '_' so underscores inside the speaker name
            # stay with the speaker and only the trailing token is the gender.
            speaker, underscore, gender = speaker_part.rpartition('_')
            if underscore:
                self.speaker_name = speaker
                self.gender = gender
            else:
                self.speaker_name = speaker_part
                self.gender = 'unknown'
        else:
            # Fallback for non-standard naming
            self.language = 'unknown'
            self.speaker_name = filename
            self.gender = 'unknown'

        # File size is best-effort: an unreadable or missing file reports 0.0
        # rather than aborting the whole listing.
        try:
            self.size_mb = Path(self.path).stat().st_size / (1024 * 1024)
        except (OSError, ValueError):
            self.size_mb = 0.0

    def to_dict(self) -> Dict:
        """Return a JSON-serialisable dict describing this voice."""
        return {
            'name': self.name,
            'language': self.language,
            'speaker': self.speaker_name,
            'gender': self.gender,
            'size_mb': round(self.size_mb, 2),
            'path': str(self.path),
        }

    @staticmethod
    def get_language_name(code: str) -> str:
        """Return the display name for *code*, or ``code.upper()`` if unmapped."""
        return VoiceInfo._LANGUAGE_NAMES.get(code, code.upper())


class VoiceManager:
    """Loads the ``*.pt`` voice presets of one directory and lists them."""

    def __init__(self, voices_dir: Optional[Path] = None):
        """Load presets from *voices_dir*.

        When *voices_dir* is ``None`` the directory
        ``voices/streaming_model`` next to this script is used.
        """
        if voices_dir is None:
            # Default to demo/voices/streaming_model
            voices_dir = Path(__file__).parent / "voices" / "streaming_model"

        self.voices_dir = Path(voices_dir)
        self.voices: List[VoiceInfo] = []
        self._load_voices()

    def _load_voices(self):
        """Append a VoiceInfo for every ``*.pt`` file, in sorted path order."""
        if not self.voices_dir.is_dir():
            # Diagnostics go to stderr so that --format json / simple output
            # on stdout stays machine-readable.
            print(
                f"Warning: Voices directory not found at {self.voices_dir}",
                file=sys.stderr,
            )
            return

        self.voices.extend(
            VoiceInfo(pt_file.stem, pt_file)
            for pt_file in sorted(self.voices_dir.glob("*.pt"))
        )

    def filter_by_language(self, lang_code: str) -> List[VoiceInfo]:
        """Return the voices whose language equals *lang_code* (case-insensitive)."""
        wanted = lang_code.lower()
        return [v for v in self.voices if v.language.lower() == wanted]

    def get_by_name(self, name: str) -> Optional[VoiceInfo]:
        """Return the first voice whose full name or speaker name is *name*.

        Returns ``None`` when nothing matches.
        """
        for voice in self.voices:
            if voice.name == name or voice.speaker_name == name:
                return voice
        return None

    def print_table(self, voices: Optional[List[VoiceInfo]] = None):
        """Print *voices* (default: all loaded voices) as an aligned table."""
        if voices is None:
            voices = self.voices

        if not voices:
            print("No voices found.")
            return

        lang_names = [VoiceInfo.get_language_name(v.language) for v in voices]

        # Column widths: longest value plus padding, with a minimum so short
        # listings still look like a table.
        name_width = max(max(len(v.name) for v in voices) + 2, 20)
        speaker_width = max(max(len(v.speaker_name) for v in voices) + 2, 15)
        lang_width = max(max(len(n) for n in lang_names) + 2, 12)
        gender_width = max(max(len(v.gender) for v in voices), 8)
        size_width = len('Size (MB)')

        header = (
            f"{'Name':<{name_width}} {'Speaker':<{speaker_width}} "
            f"{'Language':<{lang_width}} {'Gender':<{gender_width}} "
            f"{'Size (MB)':>{size_width}}"
        )
        print()
        print(header)
        # Derive the rule from the header so the two can never drift apart.
        print("=" * len(header))

        for voice, lang_name in zip(voices, lang_names):
            print(
                f"{voice.name:<{name_width}} "
                f"{voice.speaker_name:<{speaker_width}} "
                f"{lang_name:<{lang_width}} {voice.gender:<{gender_width}} "
                f"{voice.size_mb:>{size_width}.2f}"
            )

        print(f"\nTotal: {len(voices)} voice(s)")

    def print_simple(self, voices: Optional[List[VoiceInfo]] = None):
        """Print one voice name per line (default: all loaded voices)."""
        if voices is None:
            voices = self.voices

        for voice in voices:
            print(voice.name)

    def to_json(self, voices: Optional[List[VoiceInfo]] = None) -> str:
        """Return *voices* (default: all loaded voices) as a JSON document."""
        if voices is None:
            voices = self.voices

        data = {
            'total': len(voices),
            'voices_directory': str(self.voices_dir),
            'voices': [v.to_dict() for v in voices]
        }
        return json.dumps(data, indent=2)

    def get_statistics(self, voices: Optional[List[VoiceInfo]] = None) -> Dict:
        """Return counts of voices in total, per language and per gender.

        Args:
            voices: Voices to summarise.  Defaults to every loaded voice, so
                existing zero-argument callers behave as before.

        Returns:
            ``{'total': int, 'by_language': {code: n}, 'by_gender': {g: n}}``.
        """
        if voices is None:
            voices = self.voices

        return {
            'total': len(voices),
            'by_language': dict(Counter(v.language for v in voices)),
            'by_gender': dict(Counter(v.gender for v in voices)),
        }


def main(argv: Optional[List[str]] = None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="List available VibeVoice voice presets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--format',
        choices=['table', 'json', 'simple'],
        default='table',
        help='Output format (default: table)'
    )
    parser.add_argument(
        '--lang',
        type=str,
        help='Filter by language code (e.g., en, de, fr)'
    )
    parser.add_argument(
        '--voices-dir',
        type=Path,
        help='Path to voices directory (default: demo/voices/streaming_model)'
    )
    parser.add_argument(
        '--stats',
        action='store_true',
        help='Show statistics about available voices'
    )

    args = parser.parse_args(argv)

    # Initialize voice manager
    manager = VoiceManager(voices_dir=args.voices_dir)

    # Filter by language if specified
    voices = manager.voices
    if args.lang:
        voices = manager.filter_by_language(args.lang)
        if not voices:
            print(f"No voices found for language: {args.lang}", file=sys.stderr)
            # JSON consumers still get a valid (empty) document on stdout;
            # every other mode has nothing further to show.
            if args.stats or args.format != 'json':
                return

    # Show statistics if requested
    if args.stats:
        # Summarise the filtered list so that --lang and --stats combine.
        stats = manager.get_statistics(voices)
        print("\n=== Voice Statistics ===")
        print(f"Total voices: {stats['total']}")
        print("\nBy Language:")
        for lang, count in sorted(stats['by_language'].items()):
            lang_name = VoiceInfo.get_language_name(lang)
            print(f"  {lang_name} ({lang}): {count}")
        print("\nBy Gender:")
        for gender, count in sorted(stats['by_gender'].items()):
            print(f"  {gender.capitalize()}: {count}")
        print()
        return

    # Display in requested format
    if args.format == 'table':
        manager.print_table(voices)
    elif args.format == 'json':
        print(manager.to_json(voices))
    elif args.format == 'simple':
        manager.print_simple(voices)


if __name__ == '__main__':
    main()
+33,211 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4wxJ6QHM-ZOb", "metadata": { - "id": "4wxJ6QHM-ZOb" + "id": "4wxJ6QHM-ZOb", + "outputId": "a9f42189-191e-484a-8cfb-5a272fce5b3e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310, + "referenced_widgets": [ + "6aa0514fe8e74f87a8d25e2a3ec7b249", + "2e77995f5e9b4c9d93a1e50482e63464", + "5bddb298cd604fbba0776657ef807664", + "398d6a88664b44c89b06610a1b5eb33e", + "23e9f1b01aa6454bb8f63b74aaa333f5", + "411a5e8e282b4257b561655c637836ab", + "a85f891d3eb946fa94af2eff7024bc76", + "f112df2ccc7b4861a27f5336d38ac86f", + "24d904f0c7f94a749af6d440b475bdeb", + "5e994c7530c64e8ab571751a12684217", + "52eb275fb2ef41e1ba008baef9378dfe", + "0d8d3ac2411048a7b846835d60e4f23d", + "bc3d23550b01450e94574975277bf5a2", + "0df051569bd540b7a19e3aefdc69f7cd", + "b4351182f3b7462cbf915b26863cf4de", + "dae8d2fe62bd41ea9d99c9ccc817de46", + "e35dd504906745b28846c07a644c3423", + "8171a7d8dcb9432881aa56fb52743e3d", + "2058d4fb6b1745a2a971491fcc747244", + "c3311a8331244551843472284c58f60f", + "ddd6507456c9496db1525c87994c5e4a", + "be663377d2864ba3a4c6832ed9d77a3d", + "06dd326664c2487f8195c15c56326273", + "45b8402a7ce240c5b3e84520af47e6e3", + "f6374d57d06f4234ba5b29acbbec7c8d", + "cde3a6c088e545cf94e2ccc73655eff2", + "60723c32638d47dd81281d31ddc5eb66", + "b79622d7dc0e46e28ebd17d730bfe91d", + "027f223a86a249dd88b1010dc85f679f", + "7bf175a6594a4417899953186dfa8a97", + "620f2c3ccaac4028bd2668a12dc57749", + "1186b85637f2496ba1366ab4d2d2a11d", + "6e0d9876d9f94fca889a959b09319474", + "422ea9b78c584d1ca555728cd01e861b", + "063ce9a58fc748e7929ed70a72c1b287", + "75ebd541b72941b3935a96da96615293", + "7c6ad8b88a7c49ca980f14bb03fd1463", + "eb94263072dc4ab78a2f125840a5dda5", + "81ae6a64fbee460ab6c15b5cc0448474", + "3607d7c97912499191b8b632f29bc31d", + "9006110b3b8043dfbe85d1c485242834", + "4ca65f86d3a344eeb43f1e10ce2bade7", + "8fee63d65695482f9c68294816a98a6b", + "73e3840872fe4245aa88c315fe266214", + 
"f133d8b35d8f461d9e3b0c099d56dc98", + "c6c3ec7e97554e3f930cdcab2e3002fa", + "73a5e9fc2808494485db6768fdb6ac6e", + "d30457cbc6fa4a54902bc92640af82ce", + "a9c4e87ff31e458187e09e93b23bcaed", + "0af92bfff186463ab480139f415b3f45", + "2974cc410d6e40d89cdb0a600943d70e", + "f48cf6761d6d493cb76c0030249baa11", + "3ea8f4b8b4e54b99a48d410fe9659f96", + "4a7829162a704c62b40823c359c5c86e", + "c23a9a4b3ab044d6af40f293d7249733", + "50ac88eac535450a8f249f52092d2406", + "9ab10fe6f69d40de8a90d924288611bd", + "94a9a23ebefa45ad9604bcf353844337", + "e687b3ff73cb4b42964d46ade98a2b65", + "c705ff6baaf5438b93dbc4107b1c4305", + "306e3356465c4898a5f2b5ccb29aaa7c", + "9ec13363e1f046368343d27f41b08f19", + "1350cc445a004aff895cab877697cc3c", + "48bc89fef2724841b2f182cc6eed35e0", + "68ffa713da0a4b01b2c42e0c8a27835e", + "a8e4a36558e04810b203b32cdd298ffb", + "5b08b3a3aa6b4c0fb19167850be8453d", + "884ec6a31fdf410e92c5ea89c0e1419b", + "3e99d2ab170c45fe844cb81fed3ee1d7", + "4e2e16fb3c5e41429a3660f65c488d8f", + "058a4595e6c042bfa0df510046f1109d", + "5c2b36b95d60431a8bbf553e408f4f51", + "a695abbe11304923bb22390b6b8d7af2", + "47885c9ca7c943d18b0f6529ca6a8238", + "772eb2290aef43e4947878efd7794ce7", + "53f6ef4e80384e8aa06628a35c354e8f", + "83077e8e24ef484ab79eb4eefd441c80" + ] + } }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ T4 GPU detected\n", + "✅ Cloned VibeVoice repository\n", + "✅ Installed dependencies\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Fetching 6 files: 0%| | 0/6 [00:00
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. " + } + }, + "afaace7433ee4538844cd73480589dda": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_fa49258b8fe3441a82c22e9b60a955df", + "placeholder": "​", + "style": "IPY_MODEL_40db57ec16eb4e4ba4cd506abaf4069b", + "value": "" + } + }, + "70fadafd4db04a0783e329362ba8b87c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_b17a44ac600746f6a1825cbb0cc063be", + "style": "IPY_MODEL_683c557013424a18a85422835b75a8e3", + "value": true + } + }, + "dfcc460d6f424736be2fb25530b09185": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + 
"description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_2e5bc5e6f4ef48b0b00424049cec8a09", + "style": "IPY_MODEL_b13735cb5c4249a08d8c7cfa654ba106", + "tooltip": "" + } + }, + "c004c599ecc943e18ee873d55ee630ea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_161a12ab57aa41638a846a2238bf9642", + "placeholder": "​", + "style": "IPY_MODEL_0332ed76bfde46c49c9879f36e75f6f3", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "8f96189e4dd146679c62ae79e1ecc4b1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "687266971acc4ab1b03813bc8a3b8e76": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4bde1aea8d194644b4e9021f09503800": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"fa49258b8fe3441a82c22e9b60a955df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "40db57ec16eb4e4ba4cd506abaf4069b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b17a44ac600746f6a1825cbb0cc063be": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "683c557013424a18a85422835b75a8e3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2e5bc5e6f4ef48b0b00424049cec8a09": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b13735cb5c4249a08d8c7cfa654ba106": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "161a12ab57aa41638a846a2238bf9642": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0332ed76bfde46c49c9879f36e75f6f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2d2751bc81ca4bd5bef927b23a99ee93": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8deb0245c1164c0192ac324a38f32971", + "placeholder": "​", + "style": "IPY_MODEL_054055b8c233424d82c435f0d453fa56", + "value": "Connecting..." 
+ } + }, + "8deb0245c1164c0192ac324a38f32971": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "054055b8c233424d82c435f0d453fa56": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a6612ce289c246668325ec1c50b01a3e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c85a7873e70f4345a071cbe453bbe801", + "IPY_MODEL_045a67c7b0cf467b8066370dd9e7ffef", + "IPY_MODEL_8c84ea605447464f84e6b51f0790461b" + ], + "layout": "IPY_MODEL_9486fe93c26c4ab18b26c6c06d87c9f0" + } + }, + "c85a7873e70f4345a071cbe453bbe801": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_405497428efc4015932fc2e76ceb110f", + "placeholder": "​", + "style": "IPY_MODEL_a8d7ef0c5f18419983e1c3077deb4154", + "value": "Fetching 6 files: 100%" + } + }, + "045a67c7b0cf467b8066370dd9e7ffef": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04a4242afe6c4fb19aed5282d9110df2", + "max": 6, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_27adab0a48c24c3ba388c7ed0cbc8073", + "value": 6 + } + }, + "8c84ea605447464f84e6b51f0790461b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8987b61805774e3fbc660b1ddf477fff", + "placeholder": "​", + "style": "IPY_MODEL_9f543c4a97314ebd836f0f8ce3f525cb", + "value": " 6/6 [00:00<00:00, 391.95it/s]" + } + }, + "9486fe93c26c4ab18b26c6c06d87c9f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "405497428efc4015932fc2e76ceb110f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a8d7ef0c5f18419983e1c3077deb4154": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "04a4242afe6c4fb19aed5282d9110df2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27adab0a48c24c3ba388c7ed0cbc8073": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8987b61805774e3fbc660b1ddf477fff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9f543c4a97314ebd836f0f8ce3f525cb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/vibevoice-realtime-0.5b.md b/docs/vibevoice-realtime-0.5b.md index dfb2f25..a590819 100644 --- a/docs/vibevoice-realtime-0.5b.md +++ b/docs/vibevoice-realtime-0.5b.md @@ -121,11 +121,29 @@ Tip: Just try it on [Colab](https://colab.research.google.com/github/microsoft/V python demo/realtime_model_inference_from_file.py --model_path microsoft/VibeVoice-Realtime-0.5B --txt_path demo/text_examples/1p_vibevoice.txt --speaker_name Carter ``` -### [Optional] More experimental voices -Download additional experimental multi-lingual speakers before launching demo or inference from files. 
+### Usage 3: List available voices +A voice listing utility helps you discover and browse all available voice presets: + ```bash -bash demo/download_experimental_voices.sh +# List all voices in table format (default) +python demo/list_voices.py + +# List only English voices +python demo/list_voices.py --lang en + +# Show statistics about available voices +python demo/list_voices.py --stats + +# Output as JSON for programmatic use +python demo/list_voices.py --format json + +# Simple list of voice names only +python demo/list_voices.py --format simple ``` + +The utility provides detailed information including speaker name, language, gender, and file size for each available voice preset. + + ## Risks and limitations While efforts have been made to optimize it through various techniques, it may still produce outputs that are unexpected, biased, or inaccurate. VibeVoice inherits any biases, errors, or omissions produced by its base model (specifically, Qwen2.5 0.5b in this release).