com.rest.elevenlabs 3.5.0 (#117)

- added TextToSpeechRequest.ctr overload - added seed property - added applyTextNormalization property - removed deprecated optimizeStreamingLatency property - added VoiceSettings.ctr overload - added speed property - fix audio decoding bug in GeneratedClip - added TextToSpeechAsync overload with Func(VoiceClip, Task) callback - updated TextToSpeechDemo with StreamAudioSource example - updated deps - com.utilities.rest -> 3.3.1 - com.utilities.audio -> 2.2.1 - com.utilities.encoder.wav -> 2.2.0 - com.utilities.encoder.ogg -> 4.2.0
RageAgainstThePixel · Mar 2, 2025 · e7c08f9 · e7c08f9
1 parent 689ad11
commit e7c08f9
Show file tree

Hide file tree

Showing 14 changed files with 348 additions and 152 deletions.
diff --git a/.github/workflows/unity.yml b/.github/workflows/unity.yml
@@ -6,35 +6,71 @@ on:
     branches:
       - 'main'
   pull_request:
+    types: [opened, reopened, synchronize, ready_for_review]
     branches:
-      - '*'
+      - '**'
   workflow_dispatch:
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ (github.event_name == 'pull_request' || github.event.action == 'synchronize') }}
+  cancel-in-progress: ${{(github.event_name == 'pull_request' || github.event.action == 'synchronize')}}
 jobs:
   build:
-    permissions:
-      checks: write
-      pull-requests: write
+    name: ${{ matrix.os }} ${{ matrix.unity-version }} ${{ matrix.build-target }}
     runs-on: ${{ matrix.os }}
+    if: github.event_name != 'pull_request' || !github.event.pull_request.draft
+    permissions:
+      contents: read
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-15]
-        unity-versions: [2021.x, 2022.x, 6000.x]
-        include:
+        include: # for each os specify the build targets
+          - os: ubuntu-latest
+            unity-version: 2022.x
+            build-target: Android
+          - os: ubuntu-latest
+            unity-version: 6000.x
+            build-target: Android
           - os: ubuntu-latest
+            unity-version: 2022.x
             build-target: StandaloneLinux64
+          - os: ubuntu-latest
+            unity-version: 6000.x
+            build-target: StandaloneLinux64
+          - os: ubuntu-latest
+            unity-version: 2022.x
+            build-target: WebGL
+          - os: ubuntu-latest
+            unity-version: 6000.x
+            build-target: WebGL
+          - os: windows-latest
+            unity-version: 2022.x
+            build-target: StandaloneWindows64
           - os: windows-latest
+            unity-version: 6000.x
             build-target: StandaloneWindows64
-          - os: macos-15
+          - os: windows-latest
+            unity-version: 2022.x
+            build-target: WSAPlayer
+          - os: windows-latest
+            unity-version: 6000.x
+            build-target: WSAPlayer
+          - os: macos-latest
+            unity-version: 2022.x
+            build-target: iOS
+          - os: macos-latest
+            unity-version: 6000.x
+            build-target: iOS
+          - os: macos-latest
+            unity-version: 2022.x
+            build-target: StandaloneOSX
+          - os: macos-latest
+            unity-version: 6000.x
             build-target: StandaloneOSX
     steps:
       - uses: actions/checkout@v4
       - uses: RageAgainstThePixel/unity-setup@v1
         with:
-          unity-version: ${{ matrix.unity-versions }}
+          unity-version: ${{ matrix.unity-version }}
           build-targets: ${{ matrix.build-target }}
       - uses: RageAgainstThePixel/activate-unity-license@v1
         with:
@@ -56,5 +92,5 @@ jobs:
       - uses: actions/upload-artifact@v4
         if: success() || failure()
         with:
-          name: '${{ github.run_number }}.${{ github.run_attempt }}-${{ matrix.os }}-${{ matrix.unity-versions }}-${{ matrix.build-target }}-Artifacts'
+          name: '${{ github.run_number }}.${{ github.run_attempt }}-${{ matrix.os }}-${{ matrix.unity-version }}-${{ matrix.build-target }}-Artifacts'
           path: '${{ github.workspace }}/**/*.log'
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md b/ElevenLabs/Packages/com.rest.elevenlabs/Documentation~/README.md
@@ -18,6 +18,14 @@ The recommended installation method is though the unity package manager and [Ope
 
 ### Via Unity Package Manager and OpenUPM
 
+#### Terminal
+
+```bash
+openupm add com.rest.elevenlabs
+```
+
+#### Manual
+
 - Open your Unity project settings
 - Select the `Package Manager`
 ![scoped-registries](images/package-manager-scopes.png)
@@ -33,9 +41,11 @@ The recommended installation method is though the unity package manager and [Ope
 
 ### Via Unity Package Manager and Git url
 
+> [!WARNING]
+> This repo has dependencies on other repositories! You are responsible for adding these on your own.
+
 - Open your Unity Package Manager
 - Add package from git url: `https://github.com/RageAgainstThePixel/com.rest.elevenlabs.git#upm`
-  > Note: this repo has dependencies on other repositories! You are responsible for adding these on your own.
   - [com.utilities.async](https://github.com/RageAgainstThePixel/com.utilities.async)
   - [com.utilities.extensions](https://github.com/RageAgainstThePixel/com.utilities.extensions)
   - [com.utilities.audio](https://github.com/RageAgainstThePixel/com.utilities.audio)
@@ -45,7 +55,11 @@ The recommended installation method is though the unity package manager and [Ope
 
 ---
 
-## Documentation
+## [Documentation](https://rageagainstthepixel.github.io/ElevenLabs-DotNet)
+
+> Check out our new api docs!
+
+<https://rageagainstthepixel.github.io/ElevenLabs-DotNet>
 
 ### Table of Contents
 
@@ -290,7 +304,7 @@ var request = new TextToSpeechRequest(voice, message, model: Model.EnglishTurboV
 var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(request, partialClip =>
 {
     // Note: check demo scene for best practices
-    // on how to handle playback with OnAudioFilterRead
+    // on how to handle playback with StreamAudioSource.cs
     partialClips.Enqueue(partialClip);
 });
 ```

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs
@@ -1011,7 +1011,7 @@ private void RenderSpeechSynthesis()
 
                 if (EditorGUI.EndChangeCheck())
                 {
-                    currentVoiceSettings = new VoiceSettings(voiceSettingsSliderValues.x, voiceSettingsSliderValues.y, useSpeakerBoost, voiceSettingsSliderValues.z);
+                    currentVoiceSettings = new VoiceSettings(voiceSettingsSliderValues.x, voiceSettingsSliderValues.y, voiceSettingsSliderValues.z, useSpeakerBoost);
                 }
 
                 EditorGUILayout.EndHorizontal();

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Common/GeneratedClip.cs
@@ -38,8 +38,6 @@ internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData, in
             SampleRate = sampleRate;
         }
 
-        private readonly ReadOnlyMemory<byte> audioData;
-
         [SerializeField]
         private string id;
 
@@ -66,11 +64,15 @@ public AudioClip AudioClip
         {
             get
             {
-                if (audioClip == null && !audioData.IsEmpty)
+                if (audioClip == null && !ClipData.IsEmpty)
                 {
-                    var pcmData = PCMEncoder.Decode(audioData.ToArray());
-                    audioClip = AudioClip.Create(Id, pcmData.Length, 1, SampleRate, false);
-                    audioClip.SetData(pcmData, 0);
+                    var samples = ClipSamples;
+
+                    if (samples is { Length: > 0 })
+                    {
+                        audioClip = AudioClip.Create(Id, samples.Length, 1, SampleRate, false);
+                        audioClip.SetData(samples, 0);
+                    }
                 }
 
                 if (audioClip == null)
@@ -98,7 +100,7 @@ public float[] ClipSamples
             {
                 if (!ClipData.IsEmpty)
                 {
-                    clipSamples ??= PCMEncoder.Decode(ClipData.ToArray());
+                    clipSamples ??= PCMEncoder.Decode(ClipData.ToArray(), PCMFormatSize.SixteenBit, SampleRate, AudioSettings.outputSampleRate);
                 }
                 else if (audioClip != null)
                 {

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
@@ -104,12 +104,7 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Canc
                 audioData = response.Data;
             }
 
-            string cachedPath = null;
-
-            if (request.CacheFormat != CacheFormat.None)
-            {
-                cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true);
-            }
+            var cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true);
 
             return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(audioData), request.OutputFormat.GetSampleRate(), cachedPath)
             {
@@ -125,6 +120,22 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Canc
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns><see cref="VoiceClip"/>.</returns>
         public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Action<VoiceClip> partialClipCallback, CancellationToken cancellationToken = default)
+        {
+            return await TextToSpeechAsync(request, async voiceClip =>
+            {
+                partialClipCallback.Invoke(voiceClip);
+                await Task.Yield();
+            }, cancellationToken).ConfigureAwait(false);
+        }
+
+        /// <summary>
+        /// Converts text to synthesized speech.
+        /// </summary>
+        /// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
+        /// <param name="partialClipCallback">Partial <see cref="VoiceClip"/> callback with streaming data.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="VoiceClip"/>.</returns>
+        public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func<VoiceClip, Task> partialClipCallback, CancellationToken cancellationToken = default)
         {
             if (request.OutputFormat is not OutputFormat.PCM_16000 and not OutputFormat.PCM_22050 and not OutputFormat.PCM_24000 and not OutputFormat.PCM_44100)
             {
@@ -167,19 +178,14 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Acti
             }
 
             var audioData = request.WithTimestamps ? accumulatedPCMData!.ToArray() : response.Data;
-            string cachedPath = null;
-
-            if (request.CacheFormat != CacheFormat.None)
-            {
-                cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true);
-            }
+            var cachedPath = await SaveAudioToCache(audioData, clipId, request.Voice, request.OutputFormat, request.CacheFormat, cancellationToken).ConfigureAwait(true);
 
             return new VoiceClip(clipId, request.Text, request.Voice, new ReadOnlyMemory<byte>(audioData), request.OutputFormat.GetSampleRate(), cachedPath)
             {
                 TimestampedTranscriptCharacters = accumulatedTranscriptData?.ToArray() ?? Array.Empty<TimestampedTranscriptCharacter>()
             };
 
-            void StreamCallback(Response partialResponse)
+            async void StreamCallback(Response partialResponse)
             {
                 try
                 {
@@ -188,7 +194,7 @@ void StreamCallback(Response partialResponse)
                         throw new ArgumentException("Failed to parse clip id!");
                     }
 
-                    partialClipCallback.Invoke(new VoiceClip($"{clipId}_{++part}", request.Text, request.Voice, new ReadOnlyMemory<byte>(partialResponse.Data), frequency));
+                    await partialClipCallback.Invoke(new VoiceClip($"{clipId}_{++part}", request.Text, request.Voice, new ReadOnlyMemory<byte>(partialResponse.Data), frequency));
                 }
                 catch (Exception e)
                 {
@@ -264,16 +270,24 @@ private static Dictionary<string, string> CreateRequestParameters(TextToSpeechRe
                 { OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
             };
 
+#pragma warning disable CS0618 // Type or member is obsolete
             if (request.OptimizeStreamingLatency.HasValue)
             {
                 parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
             }
+#pragma warning restore CS0618 // Type or member is obsolete
 
             return parameters;
         }
 
         private static async Task<string> SaveAudioToCache(byte[] audioData, string clipId, Voice voice, OutputFormat outputFormat, CacheFormat cacheFormat, CancellationToken cancellationToken)
         {
+#if PLATFORM_WEBGL
+            await Task.Yield();
+            return null;
+#else
+            if (cacheFormat == CacheFormat.None) { return null; }
+
             string extension;
             AudioType audioType;
 
@@ -294,7 +308,6 @@ private static async Task<string> SaveAudioToCache(byte[] audioData, string clip
                         extension = "ogg";
                         audioType = AudioType.OGGVORBIS;
                         break;
-                    case CacheFormat.None:
                     default:
                         throw new ArgumentOutOfRangeException(nameof(cacheFormat), cacheFormat, null);
                 }
@@ -325,6 +338,7 @@ private static async Task<string> SaveAudioToCache(byte[] audioData, string clip
             }
 
             return cachedPath;
+#endif // PLATFORM_WEBGL
         }
 
     }