From 31ff109b2a33f1bda3c66d9ec2c7d9895cddd3eb Mon Sep 17 00:00:00 2001 From: Stephen Hodgson Date: Sat, 8 Mar 2025 18:16:01 -0500 Subject: [PATCH 1/2] com.rest.elevenlabs 3.5.1 - Fixed generated clip playback for non-streaming clips - Updated usages of Task.Delay with WebGL friendly Awaiters.DelayAsync - Updated TextToSpeechDemo - Updated Deps - Updated Unit Tests --- .gitignore | 1 + .../Runtime/Common/GeneratedClip.cs | 61 ++++++++++--------- .../Runtime/Dubbing/DubbingEndpoint.cs | 3 +- .../TextToSpeech/TextToSpeechEndpoint.cs | 2 +- .../Samples~/TextToSpeech/TextToSpeechDemo.cs | 18 +++++- .../Test_Fixture_04_TextToSpeechEndpoint.cs | 15 +++-- .../Packages/com.rest.elevenlabs/package.json | 9 ++- ElevenLabs/Packages/manifest.json | 2 +- 8 files changed, 68 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index bd4dcd4..b2653a0 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ TextMesh Pro/ UIElementsSchema/ *packages-lock.json ProjectSettings/SceneTemplateSettings.json +boot.config # ============ # # Certificates # diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs index 6b8e1f9..4ea450f 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs @@ -57,62 +57,61 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory clipData, in public Guid TextHash { get; private set; } [SerializeField] - private AudioClip audioClip; + private string cachedPath; [Preserve] - public AudioClip AudioClip + public string CachedPath => cachedPath; + + public ReadOnlyMemory ClipData { get; } + + public float[] ClipSamples { get { - if (audioClip == null && !ClipData.IsEmpty) + if (clipSamples != null) { - var samples = ClipSamples; - - if (samples is { Length: > 0 }) - { - audioClip = AudioClip.Create(Id, samples.Length, 1, SampleRate, false); - audioClip.SetData(samples, 0); - } + return clipSamples; } - if (audioClip == null) + if (ClipData.IsEmpty) { - Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync"); + return Array.Empty(); } - return audioClip; + clipSamples ??= PCMEncoder.Decode(ClipData.ToArray(), inputSampleRate: SampleRate, outputSampleRate: AudioSettings.outputSampleRate); + return clipSamples; + } } + private float[] clipSamples; + + public int SampleRate { get; } [SerializeField] - private string cachedPath; + private AudioClip audioClip; [Preserve] - public string CachedPath => cachedPath; - - public ReadOnlyMemory ClipData { get; } - - private float[] clipSamples; - - public float[] ClipSamples + public AudioClip AudioClip { get { - if (!ClipData.IsEmpty) + if (audioClip == null && + ClipSamples is { Length: > 0 }) { - clipSamples ??= PCMEncoder.Decode(ClipData.ToArray(), PCMFormatSize.SixteenBit, SampleRate, AudioSettings.outputSampleRate); + audioClip = AudioClip.Create(Id, ClipSamples.Length, 1, AudioSettings.outputSampleRate, false); + audioClip.SetData(ClipSamples, 0); } - else if (audioClip != null) + + if (audioClip == null) { - clipSamples = new float[audioClip.samples]; - audioClip.GetData(clipSamples, 0); + Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync"); } - return clipSamples; + return audioClip; } } - public int SampleRate { get; } + public float Length => ClipSamples.Length / (float)AudioSettings.outputSampleRate; public void OnBeforeSerialize() => textHash = TextHash.ToString(); @@ -130,6 +129,12 @@ var path when path.EndsWith(".mp3") => AudioType.MPEG, _ => AudioType.UNKNOWN }; + if (audioType == AudioType.UNKNOWN) + { + Debug.LogWarning($"Unable to load cached audio clip at {cachedPath}"); + return null; + } + return await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, cancellationToken: cancellationToken); } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs index cc4883e..3bae0e0 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Dubbing/DubbingEndpoint.cs @@ -10,6 +10,7 @@ using System.Threading; using System.Threading.Tasks; using UnityEngine; +using Utilities.Async; using Utilities.WebRequestRest; using Debug = UnityEngine.Debug; @@ -148,7 +149,7 @@ private async Task WaitForDubbingCompletionAsync(Dubbing Debug.Log($"Dubbing for {dubbingResponse.DubbingId} in progress... Will check status again in {pollingInterval.TotalSeconds} seconds."); } - await Task.Delay(pollingInterval, cancellationToken).ConfigureAwait(false); + await Awaiters.DelayAsync(pollingInterval, cancellationToken).ConfigureAwait(true); } else { diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs index cf6acb5..9eafcc4 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs @@ -180,7 +180,7 @@ public async Task TextToSpeechAsync(TextToSpeechRequest request, Func var audioData = request.WithTimestamps ? accumulatedPCMData!.ToArray() : response.Data; var cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true); - return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioData), request.OutputFormat.GetSampleRate(), cachedPath) + return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory(audioData), frequency, cachedPath) { TimestampedTranscriptCharacters = accumulatedTranscriptData?.ToArray() ?? Array.Empty() }; diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs index 15eb60d..7548c23 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs @@ -4,12 +4,14 @@ using ElevenLabs.TextToSpeech; using ElevenLabs.Voices; using System; +using System.Diagnostics; using System.Linq; using System.Threading; using System.Threading.Tasks; using UnityEngine; using Utilities.Async; using Utilities.Audio; +using Debug = UnityEngine.Debug; namespace ElevenLabs.Demo { @@ -63,18 +65,30 @@ private async void Start() } var request = new TextToSpeechRequest(voice, message, model: Model.FlashV2_5, outputFormat: OutputFormat.PCM_24000); + var stopwatch = Stopwatch.StartNew(); var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip => { await streamAudioSource.BufferCallbackAsync(partialClip.ClipSamples); }, cancellationToken: destroyCancellationToken); - await new WaitUntil(() => streamAudioSource.IsEmpty || destroyCancellationToken.IsCancellationRequested); - destroyCancellationToken.ThrowIfCancellationRequested(); + var elapsedTime = (float)stopwatch.Elapsed.TotalSeconds; + var playbackTime = voiceClip.Length - elapsedTime; + + if (debug) + { + Debug.Log($"Elapsed time: {elapsedTime:F} seconds"); + Debug.Log($"voice clip length: {voiceClip.Length:F} seconds"); + Debug.Log($"playback time: {playbackTime:F} seconds"); + } + + await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 0.1f), destroyCancellationToken); ((AudioSource)streamAudioSource).clip = voiceClip.AudioClip; if (debug) { Debug.Log($"Full clip: {voiceClip.Id}"); } + + ((AudioSource)streamAudioSource).Play(); } catch (Exception e) { diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs index 8865178..f10a81c 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Tests/Test_Fixture_04_TextToSpeechEndpoint.cs @@ -21,6 +21,7 @@ public async Task Test_01_TextToSpeech() var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); Assert.NotNull(voiceClip); Assert.NotNull(voiceClip.AudioClip); + Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01); Debug.Log(voiceClip.Id); } @@ -36,9 +37,10 @@ public async Task Test_02_StreamTextToSpeech() Assert.NotNull(partialClips); Assert.IsNotEmpty(partialClips); Assert.NotNull(voiceClip); - Assert.IsNotNull(voiceClip.AudioClip); Debug.Log(voiceClip.Id); Debug.Log(voiceClip.CachedPath); + Assert.IsNotNull(voiceClip.AudioClip); + Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01); } [Test] @@ -50,9 +52,10 @@ public async Task Test_03_TextToSpeech_Transcription() var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", withTimestamps: true); var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); Assert.NotNull(voiceClip); - Assert.NotNull(voiceClip.AudioClip); Debug.Log(voiceClip.Id); Debug.Log(voiceClip.CachedPath); + Assert.NotNull(voiceClip.AudioClip); + Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01); Assert.NotNull(voiceClip.TimestampedTranscriptCharacters); Assert.IsNotEmpty(voiceClip.TimestampedTranscriptCharacters); Debug.Log("| Character | Start Time | End Time |"); @@ -88,9 +91,10 @@ public async Task Test_04_StreamTextToSpeech_Transcription() Assert.NotNull(partialClips); Assert.IsNotEmpty(partialClips); Assert.NotNull(voiceClip); - Assert.IsNotNull(voiceClip.AudioClip); Debug.Log(voiceClip.Id); Debug.Log(voiceClip.CachedPath); + Assert.IsNotNull(voiceClip.AudioClip); + Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01); Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters); } @@ -111,10 +115,11 @@ public async Task Test_05_LanguageEnforced_TextToSpeech() languageCode: "cs"); var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request); Assert.NotNull(voiceClip); - Assert.NotNull(voiceClip.AudioClip); - Assert.IsTrue(string.IsNullOrWhiteSpace(voiceClip.CachedPath)); Debug.Log(voiceClip.Id); Debug.Log(voiceClip.CachedPath); + Assert.NotNull(voiceClip.AudioClip); + Assert.AreEqual(voiceClip.AudioClip.length, voiceClip.Length, 0.01); + Assert.IsTrue(string.IsNullOrWhiteSpace(voiceClip.CachedPath)); } } } diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/package.json b/ElevenLabs/Packages/com.rest.elevenlabs/package.json index 85496d7..8d411e4 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/package.json +++ b/ElevenLabs/Packages/com.rest.elevenlabs/package.json @@ -3,7 +3,7 @@ "displayName": "ElevenLabs", "description": "A non-official Eleven Labs voice synthesis RESTful client.", "keywords": [], - "version": "3.5.0", + "version": "3.5.1", "unity": "2021.3", "documentationUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs#documentation", "changelogUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs/releases", @@ -17,10 +17,9 @@ "url": "https://github.com/StephenHodgson" }, "dependencies": { - "com.utilities.rest": "3.3.1", - "com.utilities.audio": "2.2.1", - "com.utilities.encoder.ogg": "4.2.0", - "com.utilities.encoder.wav": "2.2.0" + "com.utilities.rest": "3.3.2", + "com.utilities.encoder.ogg": "4.2.1", + "com.utilities.encoder.wav": "2.2.1" }, "samples": [ { diff --git a/ElevenLabs/Packages/manifest.json b/ElevenLabs/Packages/manifest.json index 8a73013..8d7c4fd 100644 --- a/ElevenLabs/Packages/manifest.json +++ b/ElevenLabs/Packages/manifest.json @@ -3,7 +3,7 @@ "com.unity.ide.rider": "3.0.34", "com.unity.ide.visualstudio": "2.0.22", "com.unity.test-framework": "1.3.5", - "com.utilities.buildpipeline": "1.6.0" + "com.utilities.buildpipeline": "1.6.1" }, "scopedRegistries": [ { From e22f2ef387d380a8648b250f5b5f835881b8f3df Mon Sep 17 00:00:00 2001 From: Stephen Hodgson Date: Sat, 8 Mar 2025 18:25:50 -0500 Subject: [PATCH 2/2] play as oneshot --- .../Samples~/TextToSpeech/TextToSpeechDemo.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs index 7548c23..9482734 100644 --- a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs +++ b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs @@ -80,15 +80,14 @@ private async void Start() Debug.Log($"playback time: {playbackTime:F} seconds"); } - await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 0.1f), destroyCancellationToken); - ((AudioSource)streamAudioSource).clip = voiceClip.AudioClip; + await Awaiters.DelayAsync(TimeSpan.FromSeconds(playbackTime + 1f), destroyCancellationToken); if (debug) { Debug.Log($"Full clip: {voiceClip.Id}"); } - ((AudioSource)streamAudioSource).Play(); + ((AudioSource)streamAudioSource).PlayOneShot(voiceClip); } catch (Exception e) {