This repository was archived by the owner on Feb 23, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 50
This repository was archived by the owner on Feb 23, 2025. It is now read-only.
Voice Activation #25
Copy link
Copy link
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Hey,
I saw you had voice activation planned for this tool. I ended up implementing a basic solution on my end and wanted to share it with you.
In my use case, I really needed an open mic so that the player could speak whenever they wanted. So I implemented voice detection based on changes in the intensity of the audio (not perfect, but it kind of works). I've left the code below if you want to try it out.
In essence, I pick up the data points from the CapturableSoundWave and maintain a running average over time. If, for a given time frame, the values deviate enough from the average, I activate the voice recording. The recording stops after a second if the data points return to the average (i.e. the player has stopped speaking).
Header
#pragma once
#include "CoreMinimal.h"
#include "SpeechRecognizer.h"
#include "MyRecorderSpeechRecognition.generated.h"
/**
 * Records microphone audio through a UCapturableSoundWave and hands it to a USpeechRecognizer
 * using a simple volume-based voice-activation heuristic: each populated audio chunk is reduced
 * to the sum of its absolute sample values and compared against a running average plus a
 * deviation band. Audio is buffered while the voice is "active" and sent to the recognizer once
 * no chunk has exceeded the threshold for VoiceActivationDelay.
 */
UCLASS(Blueprintable)
class MY_API UMyRecorderSpeechRecognition : public UObject
{
GENERATED_BODY()
public:
/** Stops recognition and capture, unbinds this object's handlers and clears an active voice-activation timer. */
virtual void BeginDestroy() override;
/** Creates and configures the speech recognizer and the capturable sound wave, and binds their delegates. */
virtual void Init();
/** Starts speech recognition and stores bIsMuted as the session mute state; capture itself starts in OnStartSpeechRecognition. */
virtual void StartAudioSession(bool bIsMuted);
/** Stops capture and recognition (when the objects exist) and resets the voice-activation state. */
virtual void StopAudioSession();
/** Updates the mute state and forwards it to the sound wave's ToggleMute; does nothing if the state is unchanged. */
virtual void SetAudioSessionMute(const bool bIsMuted);
/** Returns the currently stored mute state. */
virtual bool GetAudioSessionMute();
/**
 * Audio Input Device ID (aka which microphone to use).
 */
UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT")
int DeviceId = 0;
/**
 * Window of captured audio data to be stored prior to voice activation.
 * Used as a chunk count: while the voice is not active the buffer is trimmed to
 * (VoiceActivationWindow - 1) times the size of the incoming chunk before that chunk is appended.
 */
UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
int VoiceActivationWindow = 5;
/**
 * Time to wait after voice activation and before sending the audio data to be recognized.
 * Passed as the rate of a one-shot timer that is restarted every time a chunk exceeds the activation threshold.
 */
UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
float VoiceActivationDelay = 1.0f;
/**
 * Multiplier of the deviation to detect a significant shift in volume to trigger voice activation.
 * A chunk triggers activation when its level exceeds ActiveAverage + ActiveDeviation * this value.
 */
UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
float VoiceActivationDeviationMultiplier = 2.0f;
/**
 * Minimum value the deviation can have to offer better results. (Most relevant in quiet environments.)
 * Acts as a lower bound every time ActiveDeviation is updated.
 */
UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
float VoiceActivationMinDeviation = 0.5f;
protected:
// Samples buffered for the recognizer: a rolling pre-activation window plus everything captured while the voice is active.
UPROPERTY()
TArray<float> AudioData;
// Running average of the per-chunk level (sum of absolute sample values); a value <= 0 means "not seeded yet".
float ActiveAverage = -1;
// Running deviation of the per-chunk level from ActiveAverage, never below VoiceActivationMinDeviation once updated.
float ActiveDeviation = -1;
// STT Section
// Mute state requested for the audio session.
UPROPERTY()
bool IsAudioSessionMuted = false;
// Recognizer created in Init(); receives the buffered audio in OnVoiceActivation().
UPROPERTY(BlueprintReadWrite)
class USpeechRecognizer* SpeechRecognizer;
// Capture source created in Init(); its OnPopulateAudioData delegate feeds OnPopulateAudioData().
UPROPERTY(BlueprintReadWrite)
class UCapturableSoundWave* CapturableSoundWave;
// Delegate passed to StartSpeechRecognition; bound to OnStartSpeechRecognition in Init().
UPROPERTY()
FOnSpeechRecognitionStartedDynamic OnStartSpeechRecognitionEvent;
// Handler for the recognizer's OnRecognitionFinished delegate (currently empty).
UFUNCTION()
void OnRecognitionFinished();
// Handler for the recognizer's OnRecognitionError delegate; logs both messages.
UFUNCTION()
void OnRecognitionError(const FString& ShortErrorMessage, const FString& LongErrorMessage);
// Handler for the recognizer's OnRecognizedTextSegment delegate (placeholder for game integration).
UFUNCTION()
void OnRecognizedTextSegment(const FString& RecognizedWords);
// Invoked through OnStartSpeechRecognitionEvent; resets state and starts microphone capture.
UFUNCTION()
void OnStartSpeechRecognition(bool bSucceeded);
// Handler for the sound wave's OnPopulateAudioData delegate; runs the voice-activation heuristic and buffers audio.
UFUNCTION()
void OnPopulateAudioData(const TArray<float>& PopulatedAudioData);
// Restores the voice-activation state to its initial values and empties the audio buffer.
UFUNCTION()
void Reset();
// True from the moment a chunk exceeds the threshold until OnVoiceActivation() has flushed the buffer.
bool IsVoiceActivated = false;
// Delegate and handle of the one-shot timer that calls OnVoiceActivation() after VoiceActivationDelay.
FTimerDelegate VoiceActivationDelegate{};
FTimerHandle VoiceActivationTimerHandle{};
// Timer callback: sends the buffered audio to the recognizer and ends the active phase.
UFUNCTION()
void OnVoiceActivation();
};Code
// Fill out your copyright notice in the Description page of Project Settings.
#include "MyRecorderSpeechRecognition.h"
#include "Sound/CapturableSoundWave.h"
void UMyRecorderSpeechRecognition::BeginDestroy()
{
if (SpeechRecognizer)
{
SpeechRecognizer->StopSpeechRecognition();
SpeechRecognizer->OnRecognitionFinished.RemoveDynamic(
this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
SpeechRecognizer->OnRecognitionError.
RemoveDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
SpeechRecognizer->OnRecognizedTextSegment.RemoveDynamic(
this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
}
OnStartSpeechRecognitionEvent.Clear();
if (CapturableSoundWave)
{
CapturableSoundWave->StopCapture();
CapturableSoundWave->OnPopulateAudioData.RemoveDynamic(
this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
}
if (GetWorld() && GetWorld()->GetTimerManager().IsTimerActive(VoiceActivationTimerHandle))
{
GetWorld()->GetTimerManager().ClearTimer(VoiceActivationTimerHandle);
}
Super::BeginDestroy();
}
void UMyRecorderSpeechRecognition::Init()
{
SpeechRecognizer = USpeechRecognizer::CreateSpeechRecognizer();
SpeechRecognizer->SetLanguage(ESpeechRecognizerLanguage::En);
SpeechRecognizer->OnRecognitionFinished.AddUniqueDynamic(
this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
SpeechRecognizer->OnRecognitionError.AddUniqueDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
SpeechRecognizer->OnRecognizedTextSegment.AddUniqueDynamic(
this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
SpeechRecognizer->SetStreamingDefaults();
SpeechRecognizer->SetSuppressBlank(true);
SpeechRecognizer->SetSuppressNonSpeechTokens(true);
SpeechRecognizer->SetNumOfThreads(0);
SpeechRecognizer->SetStepSize(0);
OnStartSpeechRecognitionEvent.BindDynamic(this, &UMyRecorderSpeechRecognition::OnStartSpeechRecognition);
CapturableSoundWave = UCapturableSoundWave::CreateCapturableSoundWave();
CapturableSoundWave->OnPopulateAudioData.AddUniqueDynamic(
this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
}
/**
 * Starts speech recognition and remembers the requested mute state.
 * Microphone capture is started later, from OnStartSpeechRecognition().
 *
 * @param bIsMuted Mute state to store for the session.
 */
void UMyRecorderSpeechRecognition::StartAudioSession(bool bIsMuted)
{
	if (SpeechRecognizer)
	{
		SpeechRecognizer->StartSpeechRecognition(OnStartSpeechRecognitionEvent);
		IsAudioSessionMuted = bIsMuted;
		return;
	}

	// Init() has not been called (or the recognizer could not be created).
	UE_LOG(LogTemp, Error, TEXT("Unable to start audio session. Speech Recognizer is not defined."));
}
void UMyRecorderSpeechRecognition::StopAudioSession()
{
if (CapturableSoundWave)
{
CapturableSoundWave->StopCapture();
}
if (SpeechRecognizer)
{
SpeechRecognizer->StopSpeechRecognition();
}
Reset();
}
/**
 * Changes the session mute state. When the state actually changes it is stored and, if a
 * capturable sound wave exists, forwarded to it through ToggleMute().
 *
 * @param bIsMuted New mute state.
 */
void UMyRecorderSpeechRecognition::SetAudioSessionMute(const bool bIsMuted)
{
	const bool bStateChanged = (IsAudioSessionMuted != bIsMuted);
	if (bStateChanged)
	{
		IsAudioSessionMuted = bIsMuted;
		if (CapturableSoundWave)
		{
			CapturableSoundWave->ToggleMute(IsAudioSessionMuted);
		}
	}
}
/** @return The mute state currently stored for the audio session. */
bool UMyRecorderSpeechRecognition::GetAudioSessionMute()
{
	const bool bMuted = IsAudioSessionMuted;
	return bMuted;
}
// Handler bound to SpeechRecognizer->OnRecognitionFinished in Init(). Currently empty.
void UMyRecorderSpeechRecognition::OnRecognitionFinished()
{
}
void UMyRecorderSpeechRecognition::OnRecognitionError(const FString& ShortErrorMessage,
const FString& LongErrorMessage)
{
UE_LOG(LogTemp, Error, TEXT("Speech Recognition Error. %s. %s"), *ShortErrorMessage, *LongErrorMessage);
}
// Handler bound to SpeechRecognizer->OnRecognizedTextSegment in Init().
// RecognizedWords is the text received from the recognizer; nothing is done with it yet.
void UMyRecorderSpeechRecognition::OnRecognizedTextSegment(const FString& RecognizedWords)
{
// Send text to game.
}
void UMyRecorderSpeechRecognition::OnStartSpeechRecognition(bool bSucceeded)
{
if (!CapturableSoundWave)
{
UE_LOG(LogTemp, Error, TEXT("Unable to start speech recognition. CapturableSoundWave is not defined."));
return;
}
Reset();
CapturableSoundWave->StartCapture(DeviceId);
if (!IsAudioSessionMuted) SetAudioSessionMute(IsAudioSessionMuted);
}
float MathSumAbs(const TArray<float>& Population)
{
float Std = 0;
for (const auto Data : Population)
{
Std += FMath::Abs(Data);
}
return Std;
}
void UMyRecorderSpeechRecognition::OnPopulateAudioData(const TArray<float>& PopulatedAudioData)
{
if (!SpeechRecognizer)
{
UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. SpeechRecognizer is not defined."));
return;
}
if (!CapturableSoundWave)
{
UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. CapturableSoundWave is not defined."));
return;
}
const float Sum = MathSumAbs(PopulatedAudioData);
const float Deviation = FMath::Abs(ActiveAverage - Sum);
if (ActiveAverage <= 0)
{
ActiveAverage = Sum;
ActiveDeviation = FMath::Max(Deviation, VoiceActivationMinDeviation);
return;
}
if (Sum > ActiveAverage + (ActiveDeviation * VoiceActivationDeviationMultiplier))
{
FTimerManager& TimerManager = GetWorld()->GetTimerManager();
if (TimerManager.IsTimerActive(VoiceActivationTimerHandle))
{
TimerManager.ClearTimer(VoiceActivationTimerHandle);
}
else
{
UE_LOG(LogTemp, Log, TEXT("Voice Activation: ON"));
}
VoiceActivationDelegate.BindUObject(this, &UMyRecorderSpeechRecognition::OnVoiceActivation);
TimerManager.SetTimer(VoiceActivationTimerHandle, VoiceActivationDelegate,
VoiceActivationDelay, false);
IsVoiceActivated = true;
}
ActiveAverage = (ActiveAverage * 0.3f) + (Sum * 0.7f);
ActiveDeviation = FMath::Max((ActiveDeviation + Deviation) * 0.5f, VoiceActivationMinDeviation);
if (!IsVoiceActivated)
{
const int WindowNum = PopulatedAudioData.Num() * (VoiceActivationWindow - 1);
if (AudioData.Num() > WindowNum)
{
const int NumDataToRemove = AudioData.Num() - WindowNum;
AudioData.RemoveAt(0, NumDataToRemove);
}
}
AudioData.Append(PopulatedAudioData);
// SpeechRecognizer->ProcessAudioData(PopulatedAudioData, CapturableSoundWave->GetSampleRate(),
// CapturableSoundWave->GetNumOfChannels(), false);
}
/**
 * Returns the voice-activation state to its starting point: empties the audio buffer, leaves
 * the active phase, marks the running average as unset and sets the deviation to its
 * configured minimum.
 */
void UMyRecorderSpeechRecognition::Reset()
{
	AudioData.Empty();
	IsVoiceActivated = false;

	// A non-positive average makes OnPopulateAudioData() re-seed its statistics.
	ActiveAverage = -1;
	ActiveDeviation = VoiceActivationMinDeviation;
}
void UMyRecorderSpeechRecognition::OnVoiceActivation()
{
UE_LOG(LogTemp, Log, TEXT("Voice Activation: OFF"));
SpeechRecognizer->ProcessAudioData(AudioData, CapturableSoundWave->GetSampleRate(),
CapturableSoundWave->GetNumOfChannels(), true);
AudioData.Empty();
IsVoiceActivated = false;
}Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or request