adds WithTimestamp functions for regular and streaming versions #99

tomkail · 2024-11-02T10:38:17Z

Demo code here:

// Licensed under the MIT License. See LICENSE in the project root for license information.

using ElevenLabs.Models;
using ElevenLabs.Voices;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using ElevenLabs.TextToSpeech;
using UnityEngine;
using Utilities.Async;

namespace ElevenLabs.Demo
{
    [RequireComponent(typeof(AudioSource))]
    public class TextToSpeechDemo : MonoBehaviour
    {
        // Configuration handed to the ElevenLabsClient constructor in Start.
        [SerializeField]
        private ElevenLabsConfiguration configuration;

        // Forwarded to the client's EnableDebug flag and also gates this demo's
        // "Full clip" and "Current character" log messages.
        [SerializeField]
        private bool debug = true;

        // Voice used for the request. When left null, Start picks the first entry
        // returned by VoicesEndpoint.GetAllVoicesAsync.
        [SerializeField]
        private Voice voice;

        // Text passed to the TextToSpeechRequest built in Start.
        [TextArea(3, 10)]
        [SerializeField]
        private string message;

        // Playback target. OnValidate falls back to GetComponent<AudioSource>() when unassigned.
        [SerializeField]
        private AudioSource audioSource;

        // Clips waiting to be played one at a time by PlayStreamQueue.
        private readonly Queue<AudioClip> streamClipQueue = new();

        // Only compiled when UNITY_2022_3_OR_NEWER is not defined: supplies a
        // destroyCancellationToken backed by lifetimeCts, which OnDestroy cancels and disposes.
        // NOTE(review): newer Unity versions presumably provide MonoBehaviour.destroyCancellationToken
        // themselves, which is why the name is reused here - confirm against the Unity docs.
#if !UNITY_2022_3_OR_NEWER
        private readonly CancellationTokenSource lifetimeCts = new();
        private CancellationToken destroyCancellationToken => lifetimeCts.Token;
#endif

        // Per-character timing data for the clip assigned in Start; read every frame by Update.
        private TimestampedTranscriptCharacter[] currentTranscript;
        // Index into currentTranscript that Update moves forward/backward to follow audioSource.time.
        private int currentCharacterIndex;

        /// <summary>
        /// Makes sure <see cref="audioSource"/> is populated, falling back to the
        /// <see cref="AudioSource"/> component on this GameObject. Also called from Start.
        /// </summary>
        private void OnValidate()
        {
            // Unity's overloaded null comparison is used on purpose so that a
            // destroyed/missing reference is treated the same as an unassigned one.
            if (audioSource != null)
            {
                return;
            }

            audioSource = GetComponent<AudioSource>();
        }

        /// <summary>
        /// Demo entry point. Resolves a voice, requests streamed text-to-speech with
        /// per-character timestamps, then plays the finished clip on <see cref="audioSource"/>
        /// and stores the transcript so that Update can follow playback.
        /// </summary>
        /// <remarks>
        /// Declared <c>async void</c> because it is a Unity message; every failure is
        /// handled inside the method so nothing escapes unobserved.
        /// The same endpoint also offers <c>StreamTextToSpeechAsync</c> (partial clips only) and
        /// <c>TextToSpeechWithTimestampsAsync</c> (non-streaming) as alternatives to the call used here.
        /// </remarks>
        private async void Start()
        {
            OnValidate();

            // Owned by this method and released in the finally block, including on failure.
            CancellationTokenSource streamQueueCts = null;

            try
            {
                // Read the token once. On the fallback path the property goes through
                // lifetimeCts.Token, which throws ObjectDisposedException after OnDestroy has run,
                // so it must not be re-read after an await.
                var destroyToken = destroyCancellationToken;

                var api = new ElevenLabsClient(configuration)
                {
                    EnableDebug = debug
                };

                if (voice == null)
                {
                    voice = (await api.VoicesEndpoint.GetAllVoicesAsync(destroyToken)).FirstOrDefault();
                }

                // FirstOrDefault yields null for an empty voice list; fail with a clear
                // message instead of a NullReferenceException on voice.Settings below.
                if (voice == null)
                {
                    Debug.LogError("No voice assigned and GetAllVoicesAsync returned no voices.");
                    return;
                }

                streamClipQueue.Clear();
                streamQueueCts = CancellationTokenSource.CreateLinkedTokenSource(destroyToken);
                PlayStreamQueue(streamQueueCts.Token);

                var voiceSettings = voice.Settings ?? await api.VoicesEndpoint.GetDefaultVoiceSettingsAsync(destroyToken);
                var request = new TextToSpeechRequest(voice, message, Encoding.UTF8, voiceSettings, OutputFormat.PCM_24000, null, Model.TurboV2_5, null);

                var result = await api.TextToSpeechEndpoint.StreamTextToSpeechWithTimestampsAsync(request, partialClipAndCharacters =>
                {
                    // Partial clips are deliberately not enqueued here: the complete clip is
                    // played below, and queueing the partials as well would play the audio twice.
                    // To play while streaming instead, enqueue partialClipAndCharacters.audioClip
                    // into streamClipQueue and skip the audioSource.Play() call below.
                }, cancellationToken: destroyToken);

                audioSource.clip = result.voiceClip.AudioClip;
                audioSource.Play();
                currentTranscript = result.timestampedTranscriptCharacters;
                currentCharacterIndex = 0;

                if (debug)
                {
                    Debug.Log($"Full clip: {result.voiceClip.Id}");
                }
            }
            catch (OperationCanceledException)
            {
                // Expected when the component is destroyed while a request is in flight;
                // not an error, so only mention it when verbose logging is on.
                if (debug)
                {
                    Debug.Log("Text to speech request was cancelled.");
                }
            }
            catch (Exception e)
            {
                Debug.LogError(e);
            }
            finally
            {
                // Stop the queue player and release the linked source on every exit path.
                if (streamQueueCts != null)
                {
                    streamQueueCts.Cancel();
                    streamQueueCts.Dispose();
                }
            }
        }

#if !UNITY_2022_3_OR_NEWER
        /// <summary>
        /// Cancels the fallback lifetime token so pending work tied to
        /// destroyCancellationToken stops, then releases the token source.
        /// </summary>
        private void OnDestroy()
        {
            try
            {
                lifetimeCts.Cancel();
            }
            finally
            {
                // Cancel() rethrows exceptions raised by registered callbacks;
                // dispose regardless so the source is never leaked.
                lifetimeCts.Dispose();
            }
        }
#endif

        /// <summary>
        /// Plays queued partial clips one after another on <see cref="audioSource"/>
        /// until <paramref name="cancellationToken"/> is cancelled.
        /// </summary>
        /// <param name="cancellationToken">Stops the loop; also ends the initial wait for the first clip.</param>
        private async void PlayStreamQueue(CancellationToken cancellationToken)
        {
            try
            {
                // Wait for the first clip, but give up as soon as cancellation is requested.
                // Without the second condition this wait never completes when nothing is enqueued.
                await new WaitUntil(() => streamClipQueue.Count > 0 || cancellationToken.IsCancellationRequested);
                var endOfFrame = new WaitForEndOfFrame();

                // Check cancellation before touching audioSource so that nothing is
                // played (or dereferenced) after the caller has asked to stop.
                while (!cancellationToken.IsCancellationRequested)
                {
                    if (!audioSource.isPlaying &&
                        streamClipQueue.TryDequeue(out var clip))
                    {
                        // Gated like the other informational logs in this class.
                        if (debug)
                        {
                            Debug.Log($"playing partial clip: {clip.name}");
                        }

                        audioSource.PlayOneShot(clip);
                    }

                    await endOfFrame;
                }
            }
            catch (Exception e)
            {
                Debug.LogError(e);
            }
        }

        /// <summary>
        /// While audio is playing, keeps <see cref="currentCharacterIndex"/> aligned with
        /// <c>audioSource.time</c> and, when <see cref="debug"/> is set, logs the transcript
        /// character whose start/end window contains the current playback time.
        /// </summary>
        private void Update()
        {
            if (!audioSource.isPlaying || currentTranscript == null || currentTranscript.Length == 0)
            {
                return;
            }

            var playbackTime = audioSource.time;
            var transcriptLength = currentTranscript.Length;

            // Step forward past every character that has already finished.
            while (currentCharacterIndex < transcriptLength &&
                   playbackTime >= currentTranscript[currentCharacterIndex].EndTime)
            {
                currentCharacterIndex += 1;
            }

            // Step back again if playback time is earlier than the previous character's end.
            while (currentCharacterIndex > 0 &&
                   playbackTime < currentTranscript[currentCharacterIndex - 1].EndTime)
            {
                currentCharacterIndex -= 1;
            }

            // Past the last character: nothing left to report.
            if (currentCharacterIndex >= transcriptLength)
            {
                return;
            }

            var spoken = currentTranscript[currentCharacterIndex];
            var insideWindow = playbackTime >= spoken.StartTime && playbackTime <= spoken.EndTime;

            if (insideWindow && debug)
            {
                Debug.Log($"Current character: '{spoken.Character}' at time {playbackTime:F2}s " +
                        $"(Start: {spoken.StartTime:F2}s, End: {spoken.EndTime:F2}s)");
            }
        }
    }
}

StephenHodgson · 2024-11-02T18:15:33Z

ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/TimestampedTranscriptCharacter.cs

+    public readonly struct TimestampedTranscriptCharacter
+    {
+        /// <summary>
+        /// The character being spoken
+        /// </summary>
+        public readonly string Character;


Structs should not contain strings. It is OK to make this a class.

StephenHodgson · 2024-11-02T18:16:24Z

ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs

+        /// </param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns>A tuple containing the <see cref="VoiceClip"/> and an array of <see cref="TimestampedTranscriptCharacter"/>.</returns>
+        public async Task<(VoiceClip voiceClip, TimestampedTranscriptCharacter[] timestampedTranscriptCharacters)> TextToSpeechWithTimestampsAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default)


For a public API, I prefer not to return anonymous tuples in this way. I'd prefer to create a class definition for the return type, to make breaking changes easier to deal with in the future.

StephenHodgson · 2024-11-02T18:17:03Z

ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs

+        /// <returns>Downloaded clip path, and the loaded audio clip.</returns>
+        public async Task<(VoiceClip voiceClip, TimestampedTranscriptCharacter[] timestampedTranscriptCharacters)> StreamTextToSpeechWithTimestampsAsync(TextToSpeechRequest request, Action<(AudioClip audioClip, TimestampedTranscriptCharacter[] timestampedTranscriptCharacters)> partialClipCallback, CancellationToken cancellationToken = default)
+        {
+            var frequency = request.OutputFormat switch


This seems like a lot of duplicated code. Maybe we can reorganize it a bit to reduce the duplication?

StephenHodgson

see comments

adds WithTimestamp functions for regular and streaming versions

9b7adf6

tomkail requested a review from StephenHodgson as a code owner November 2, 2024 10:38

StephenHodgson changed the base branch from main to development November 2, 2024 18:10

Merge branch 'development' into with_timestamps

0fa6526

StephenHodgson reviewed Nov 2, 2024

View reviewed changes

StephenHodgson requested changes Nov 2, 2024

View reviewed changes

tomkail closed this Nov 3, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

adds WithTimestamp functions for regular and streaming versions #99

adds WithTimestamp functions for regular and streaming versions #99

tomkail commented Nov 2, 2024 •

edited by StephenHodgson

Loading

StephenHodgson Nov 2, 2024

StephenHodgson Nov 2, 2024

StephenHodgson Nov 2, 2024 •

edited

Loading

StephenHodgson left a comment

adds WithTimestamp functions for regular and streaming versions #99

adds WithTimestamp functions for regular and streaming versions #99

Conversation

tomkail commented Nov 2, 2024 • edited by StephenHodgson Loading

StephenHodgson Nov 2, 2024

Choose a reason for hiding this comment

StephenHodgson Nov 2, 2024

Choose a reason for hiding this comment

StephenHodgson Nov 2, 2024 • edited Loading

Choose a reason for hiding this comment

StephenHodgson left a comment

Choose a reason for hiding this comment

tomkail commented Nov 2, 2024 •

edited by StephenHodgson

Loading

StephenHodgson Nov 2, 2024 •

edited

Loading