From 31ff109b2a33f1bda3c66d9ec2c7d9895cddd3eb Mon Sep 17 00:00:00 2001
From: Stephen Hodgson <rage.against.the.pixel@gmail.com>
Date: Sat, 8 Mar 2025 18:16:01 -0500
Subject: [PATCH 1/2] com.rest.elevenlabs 3.5.1

- Fixed generated clip playback for non-streaming clips
- Replaced usages of Task.Delay with the WebGL-friendly Awaiters.DelayAsync
- Updated TextToSpeechDemo
- Updated dependencies
- Updated Unit Tests
---
 .gitignore                                    |  1 +
 .../Runtime/Common/GeneratedClip.cs           | 61 ++++++++++---------
 .../Runtime/Dubbing/DubbingEndpoint.cs        |  3 +-
 .../TextToSpeech/TextToSpeechEndpoint.cs      |  2 +-
 .../Samples~/TextToSpeech/TextToSpeechDemo.cs | 18 +++++-
 .../Test_Fixture_04_TextToSpeechEndpoint.cs   | 15 +++--
 .../Packages/com.rest.elevenlabs/package.json |  9 ++-
 ElevenLabs/Packages/manifest.json             |  2 +-
 8 files changed, 68 insertions(+), 43 deletions(-)
diff --git a/.gitignore b/.gitignore
index bd4dcd4..b2653a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@ TextMesh Pro/
 UIElementsSchema/
 *packages-lock.json
 ProjectSettings/SceneTemplateSettings.json
+boot.config
 
 # ============ #
 # Certificates #
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs
index 6b8e1f9..4ea450f 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs
@@ -57,62 +57,61 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData, in
         public Guid TextHash { get; private set; }
 
         [SerializeField]
-        private AudioClip audioClip;
+        private string cachedPath;
 
         [Preserve]
-        public AudioClip AudioClip
+        public string CachedPath => cachedPath;
+
+        public ReadOnlyMemory<byte> ClipData { get; }
+
+        public float[] ClipSamples
         {
             get
             {
-                if (audioClip == null && !ClipData.IsEmpty)
+                if (clipSamples != null)
                 {
-                    var samples = ClipSamples;
-
-                    if (samples is { Length: > 0 })
-                    {
-                        audioClip = AudioClip.Create(Id, samples.Length, 1, SampleRate, false);
-                        audioClip.SetData(samples, 0);
-                    }
+                    return clipSamples;
                 }
 
-                if (audioClip == null)
+                if (ClipData.IsEmpty)
                 {
-                    Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync");
+                    return Array.Empty<float>();
                 }
 
-                return audioClip;
+                clipSamples ??= PCMEncoder.Decode(ClipData.ToArray(), inputSampleRate: SampleRate, outputSampleRate: AudioSettings.outputSampleRate);
+                return clipSamples;
+
             }
         }
+        private float[] clipSamples;
+
+        public int SampleRate { get; }
 
         [SerializeField]
-        private string cachedPath;
+        private AudioClip audioClip;
 
         [Preserve]
-        public string CachedPath => cachedPath;
-
-        public ReadOnlyMemory<byte> ClipData { get; }
-
-        private float[] clipSamples;
-
-        public float[] ClipSamples
+        public AudioClip AudioClip
         {
             get
             {
-                if (!ClipData.IsEmpty)
+                if (audioClip == null &&
+                    ClipSamples is { Length: > 0 })
                 {
-                    clipSamples ??= PCMEncoder.Decode(ClipData.ToArray(), PCMFormatSize.SixteenBit, SampleRate, AudioSettings.outputSampleRate);
+                    audioClip = AudioClip.Create(Id, ClipSamples.Length, 1, AudioSettings.outputSampleRate, false);
+                    audioClip.SetData(ClipSamples, 0);
                 }
-                else if (audioClip != null)
+
+                if (audioClip == null)
                 {
-                    clipSamples = new float[audioClip.samples];
-                    audioClip.GetData(clipSamples, 0);
+                    Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync");
                 }
 
-                return clipSamples;
+                return audioClip;
             }
         }
 
-        public int SampleRate { get; }
+        public float Length => ClipSamples.Length / (float)AudioSettings.outputSampleRate;
 
         public void OnBeforeSerialize() => textHash = TextHash.ToString();
 
@@ -130,6 +129,12 @@ var path when path.EndsWith(".mp3") => AudioType.MPEG,
                 _ => AudioType.UNKNOWN
             };
 
+            if (audioType == AudioType.UNKNOWN)
+            {
+                Debug.LogWarning($"Unable to load cached audio clip at {cachedPath}");
+                return null;
+            }
+
             return await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, cancellationToken: cancellationToken);
         }
     }
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs
index cc4883e..3bae0e0 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs
@@ -10,6 +10,7 @@
 using System.Threading;
 using System.Threading.Tasks;
 using UnityEngine;
+using Utilities.Async;
 using Utilities.WebRequestRest;
 using Debug = UnityEngine.Debug;
 
@@ -148,7 +149,7 @@ private async Task<DubbingProjectMetadata> WaitForDubbingCompletionAsync(Dubbing
                         Debug.Log($"Dubbing for {dubbingResponse.DubbingId} in progress... Will check status again in {pollingInterval.TotalSeconds} seconds.");
                     }
 
-                    await Task.Delay(pollingInterval, cancellationToken).ConfigureAwait(false);
+                    await Awaiters.DelayAsync(pollingInterval, cancellationToken).ConfigureAwait(true);
                 }
                 else
                 {
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
index cf6acb5..9eafcc4 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
@@ -180,7 +180,7 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func
             var audioData = request.WithTimestamps ? accumulatedPCMData!.ToArray() : response.Data;
             var cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true);
 
-            return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(audioData), request.OutputFormat.GetSampleRate(), cachedPath)
+            return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(audioData), frequency, cachedPath)
             {
                 TimestampedTranscriptCharacters = accumulatedTranscriptData?.ToArray() ?? Array.Empty<TimestampedTranscriptCharacter>()
             };
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
index 15eb60d..7548c23 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
@@ -4,12 +4,14 @@
 using ElevenLabs.TextToSpeech;
 using ElevenLabs.Voices;
 using System;
+using System.Diagnostics;
 using System.Linq;
 using System.Threading;
 using System.Threading.Tasks;
 using UnityEngine;
 using Utilities.Async;
 using Utilities.Audio;
+using Debug = UnityEngine.Debug;
 
 namespace ElevenLabs.Demo
 {
@@ -63,18 +65,30 @@ private async void Start()
                 }
 
                 var request = new TextToSpeechRequest(voice, message, model: Model.FlashV2_5, outputFormat: OutputFormat.PCM_24000);
+                var stopwatch = Stopwatch.StartNew();
                 var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
                 {
                     await streamAudioSource.BufferCallbackAsync(partialClip.ClipSamples);
                 }, cancellationToken: destroyCancellationToken);
-                await new WaitUntil(() => streamAudioSource.IsEmpty || destroyCancellationToken.IsCancellationRequested);
-                destroyCancellationToken.ThrowIfCancellationRequested();
+                var elapsedTime = (float)stopwatch.Elapsed.TotalSeconds;
+                var playbackTime = voiceClip.Length - elapsedTime;
+
+                if (debug)
+                {
+                    Debug.Log($"Elapsed time: {elapsedTime:F} seconds");
+                    Debug.Log($"voice clip length: {voiceClip.Length:F} seconds");
+                    Debug.Log($"playback time: {playbackTime:F} seconds");
+                }
+
+                await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 0.1f), destroyCancellationToken);
                 ((AudioSource)streamAudioSource).clip = voiceClip.AudioClip;
 
                 if (debug)
                 {
                     Debug.Log($"Full clip: {voiceClip.Id}");
                 }
+
+                ((AudioSource)streamAudioSource).Play();
             }
             catch (Exception e)
             {
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs
index 8865178..f10a81c 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs
@@ -21,6 +21,7 @@ public async Task Test_01_TextToSpeech()
             var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
             Assert.NotNull(voiceClip);
             Assert.NotNull(voiceClip.AudioClip);
+            Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01);
             Debug.Log(voiceClip.Id);
         }
 
@@ -36,9 +37,10 @@ public async Task Test_02_StreamTextToSpeech()
             Assert.NotNull(partialClips);
             Assert.IsNotEmpty(partialClips);
             Assert.NotNull(voiceClip);
-            Assert.IsNotNull(voiceClip.AudioClip);
             Debug.Log(voiceClip.Id);
             Debug.Log(voiceClip.CachedPath);
+            Assert.IsNotNull(voiceClip.AudioClip);
+            Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01);
         }
 
         [Test]
@@ -50,9 +52,10 @@ public async Task Test_03_TextToSpeech_Transcription()
             var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", withTimestamps: true);
             var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
             Assert.NotNull(voiceClip);
-            Assert.NotNull(voiceClip.AudioClip);
             Debug.Log(voiceClip.Id);
             Debug.Log(voiceClip.CachedPath);
+            Assert.NotNull(voiceClip.AudioClip);
+            Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01);
             Assert.NotNull(voiceClip.TimestampedTranscriptCharacters);
             Assert.IsNotEmpty(voiceClip.TimestampedTranscriptCharacters);
             Debug.Log("| Character | Start Time | End Time |");
@@ -88,9 +91,10 @@ public async Task Test_04_StreamTextToSpeech_Transcription()
             Assert.NotNull(partialClips);
             Assert.IsNotEmpty(partialClips);
             Assert.NotNull(voiceClip);
-            Assert.IsNotNull(voiceClip.AudioClip);
             Debug.Log(voiceClip.Id);
             Debug.Log(voiceClip.CachedPath);
+            Assert.IsNotNull(voiceClip.AudioClip);
+            Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01);
             Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters);
         }
 
@@ -111,10 +115,11 @@ public async Task Test_05_LanguageEnforced_TextToSpeech()
                 languageCode: "cs");
             var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request);
             Assert.NotNull(voiceClip);
-            Assert.NotNull(voiceClip.AudioClip);
-            Assert.IsTrue(string.IsNullOrWhiteSpace(voiceClip.CachedPath));
             Debug.Log(voiceClip.Id);
             Debug.Log(voiceClip.CachedPath);
+            Assert.NotNull(voiceClip.AudioClip);
+            Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01);
+            Assert.IsTrue(string.IsNullOrWhiteSpace(voiceClip.CachedPath));
         }
     }
 }
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/package.json b/ElevenLabs/Packages/com.rest.elevenlabs/package.json
index 85496d7..8d411e4 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/package.json
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/package.json
@@ -3,7 +3,7 @@
   "displayName": "ElevenLabs",
   "description": "A non-official Eleven Labs voice synthesis RESTful client.",
   "keywords": [],
-  "version": "3.5.0",
+  "version": "3.5.1",
   "unity": "2021.3",
   "documentationUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs#documentation",
   "changelogUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs/releases",
@@ -17,10 +17,9 @@
     "url": "https://github.com/StephenHodgson"
   },
   "dependencies": {
-    "com.utilities.rest": "3.3.1",
-    "com.utilities.audio": "2.2.1",
-    "com.utilities.encoder.ogg": "4.2.0",
-    "com.utilities.encoder.wav": "2.2.0"
+    "com.utilities.rest": "3.3.2",
+    "com.utilities.encoder.ogg": "4.2.1",
+    "com.utilities.encoder.wav": "2.2.1"
   },
   "samples": [
     {
diff --git a/ElevenLabs/Packages/manifest.json b/ElevenLabs/Packages/manifest.json
index 8a73013..8d7c4fd 100644
--- a/ElevenLabs/Packages/manifest.json
+++ b/ElevenLabs/Packages/manifest.json
@@ -3,7 +3,7 @@
     "com.unity.ide.rider": "3.0.34",
     "com.unity.ide.visualstudio": "2.0.22",
     "com.unity.test-framework": "1.3.5",
-    "com.utilities.buildpipeline": "1.6.0"
+    "com.utilities.buildpipeline": "1.6.1"
   },
   "scopedRegistries": [
     {

From e22f2ef387d380a8648b250f5b5f835881b8f3df Mon Sep 17 00:00:00 2001
From: Stephen Hodgson <rage.against.the.pixel@gmail.com>
Date: Sat, 8 Mar 2025 18:25:50 -0500
Subject: [PATCH 2/2] Play the full clip as a one-shot in TextToSpeechDemo

---
 .../Samples~/TextToSpeech/TextToSpeechDemo.cs                | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
index 7548c23..9482734 100644
--- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
+++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
@@ -80,15 +80,14 @@ private async void Start()
                     Debug.Log($"playback time: {playbackTime:F} seconds");
                 }
 
-                await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 0.1f), destroyCancellationToken);
-                ((AudioSource)streamAudioSource).clip = voiceClip.AudioClip;
+                await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 1f), destroyCancellationToken);
 
                 if (debug)
                 {
                     Debug.Log($"Full clip: {voiceClip.Id}");
                 }
 
-                ((AudioSource)streamAudioSource).Play();
+                ((AudioSource)streamAudioSource).PlayOneShot(voiceClip);
             }
             catch (Exception e)
             {