Skip to content
This repository was archived by the owner on Feb 23, 2025. It is now read-only.

Voice Activation #25

@RicardoEPRodrigues

Description

@RicardoEPRodrigues

Hey,

I saw you had voice activation planned for this tool. I ended up implementing a basic solution on my end and wanted to share it with you.

In my use case, I really needed an open mic where the player spoke when they wanted. So I implemented voice detection through changes in the intensity of the audio (not perfect but it kinda works). I leave the code below if you want to try it out.

In essence, I pick up the data points from the CapturableSoundWave and create an average over time. If for a given time frame, the values deviate enough from the average, then I activate the voice recording. The recording stops after a second if the data points return to the average (the player stopped speaking).

Header

#pragma once

#include "CoreMinimal.h"
#include "SpeechRecognizer.h"
#include "MyRecorderSpeechRecognition.generated.h"

/**
 * Open-mic speech-to-text recorder with volume-based voice activation.
 *
 * Tracks a running average of captured audio intensity per chunk; when a chunk
 * deviates far enough above that average, recording is considered "active" and
 * the buffered audio is handed to the speech recognizer once the input returns
 * to the average for VoiceActivationDelay seconds.
 *
 * NOTE(review): depends on plugin types (USpeechRecognizer,
 * UCapturableSoundWave) — confirm the corresponding plugins are enabled.
 */
UCLASS(Blueprintable)
class MY_API UMyRecorderSpeechRecognition : public UObject
{
	GENERATED_BODY()

public:
	// Unbinds delegates, stops capture/recognition, and clears the timer.
	virtual void BeginDestroy() override;

	// Creates the recognizer and the capturable sound wave and wires up
	// all callbacks. Must be called before StartAudioSession.
	virtual void Init();
	// Starts streaming recognition; microphone capture begins when the
	// recognizer reports it has started (see OnStartSpeechRecognition).
	virtual void StartAudioSession(bool bIsMuted);
	// Stops capture and recognition and resets the activation state.
	virtual void StopAudioSession();
	// Mutes/unmutes the capture (no-op if the state is unchanged).
	virtual void SetAudioSessionMute(const bool bIsMuted);
	// Returns the cached mute state (does not query the capture device).
	virtual bool GetAudioSessionMute();

	/**
	 * Audio Input Device ID (aka which microphone to use).
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT")
	int DeviceId = 0;

	/**
	 * Window of captured audio data to be stored prior to voice activation.
	 * Measured in audio chunks (see OnPopulateAudioData), not seconds.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	int VoiceActivationWindow = 5;

	/**
	 * Time to wait after voice activation and before sending the audio data to be recognized.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationDelay = 1.0f;

	/**
	 * Multiplier of the deviation to detect a significant shift in volume to trigger voice activation.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationDeviationMultiplier = 2.0f;

	/**
	 * Minimum value the deviation can have to offer better results. (Most relevant in quiet environments.)
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationMinDeviation = 0.5f;

protected:

	// Rolling buffer of captured samples awaiting recognition.
	UPROPERTY()
	TArray<float> AudioData;

	// Running average of per-chunk summed |sample| values; -1 means unseeded.
	float ActiveAverage = -1;
	// Running deviation from ActiveAverage; -1 means unseeded.
	float ActiveDeviation = -1;
	
	// STT Section
	UPROPERTY()
	bool IsAudioSessionMuted = false;

	UPROPERTY(BlueprintReadWrite)
	class USpeechRecognizer* SpeechRecognizer;

	UPROPERTY(BlueprintReadWrite)
	class UCapturableSoundWave* CapturableSoundWave;

	// Bound to OnStartSpeechRecognition and passed to StartSpeechRecognition.
	UPROPERTY()
	FOnSpeechRecognitionStartedDynamic OnStartSpeechRecognitionEvent;

	UFUNCTION()
	void OnRecognitionFinished();
	UFUNCTION()
	void OnRecognitionError(const FString& ShortErrorMessage, const FString& LongErrorMessage);
	UFUNCTION()
	void OnRecognizedTextSegment(const FString& RecognizedWords);

	// Recognizer start callback; kicks off microphone capture.
	UFUNCTION()
	void OnStartSpeechRecognition(bool bSucceeded);

	// Per-chunk capture callback; runs the voice-activation heuristic.
	UFUNCTION()
	void OnPopulateAudioData(const TArray<float>& PopulatedAudioData);

	// Clears averages, the activation flag, and the buffered audio.
	UFUNCTION()
	void Reset();

	// True while loud-enough chunks keep the deactivation timer armed.
	bool IsVoiceActivated = false;
	
	// Fires VoiceActivationDelay seconds after the last loud chunk.
	FTimerDelegate VoiceActivationDelegate{};
	FTimerHandle VoiceActivationTimerHandle{};

	UFUNCTION()
	void OnVoiceActivation();
};

Code

// Fill out your copyright notice in the Description page of Project Settings.


#include "MyRecorderSpeechRecognition.h"

#include "Sound/CapturableSoundWave.h"

// Tears down the session before the object is garbage collected: stop the
// recognizer and capture, unbind every dynamic delegate so no callback can
// land on a dying object, and cancel the pending voice-activation timer.
void UMyRecorderSpeechRecognition::BeginDestroy()
{
	if (SpeechRecognizer)
	{
		SpeechRecognizer->StopSpeechRecognition();
		SpeechRecognizer->OnRecognitionFinished.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
		SpeechRecognizer->OnRecognitionError.
		                  RemoveDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
		SpeechRecognizer->OnRecognizedTextSegment.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
	}

	OnStartSpeechRecognitionEvent.Clear();

	if (CapturableSoundWave)
	{
		CapturableSoundWave->StopCapture();
		CapturableSoundWave->OnPopulateAudioData.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
	}

	// GetWorld() may be null for a plain UObject, so guard before touching
	// the timer manager.
	if (GetWorld() && GetWorld()->GetTimerManager().IsTimerActive(VoiceActivationTimerHandle))
	{
		GetWorld()->GetTimerManager().ClearTimer(VoiceActivationTimerHandle);
	}

	Super::BeginDestroy();
}

// Creates the speech recognizer and capturable sound wave and wires all
// callbacks. Must run before StartAudioSession; safe to call once per object.
void UMyRecorderSpeechRecognition::Init()
{
	SpeechRecognizer = USpeechRecognizer::CreateSpeechRecognizer();
	SpeechRecognizer->SetLanguage(ESpeechRecognizerLanguage::En);
	SpeechRecognizer->OnRecognitionFinished.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
	SpeechRecognizer->OnRecognitionError.AddUniqueDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
	SpeechRecognizer->OnRecognizedTextSegment.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
	SpeechRecognizer->SetStreamingDefaults();
	SpeechRecognizer->SetSuppressBlank(true);
	SpeechRecognizer->SetSuppressNonSpeechTokens(true);
	// NOTE(review): 0 presumably means "let the plugin pick a default" for
	// thread count and step size — confirm against the plugin documentation.
	SpeechRecognizer->SetNumOfThreads(0);
	SpeechRecognizer->SetStepSize(0);

	OnStartSpeechRecognitionEvent.BindDynamic(this, &UMyRecorderSpeechRecognition::OnStartSpeechRecognition);

	CapturableSoundWave = UCapturableSoundWave::CreateCapturableSoundWave();
	CapturableSoundWave->OnPopulateAudioData.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
}

// Begins a recognition session. Capture itself starts later, from the
// recognizer's start callback (OnStartSpeechRecognition).
void UMyRecorderSpeechRecognition::StartAudioSession(bool bIsMuted)
{
	if (SpeechRecognizer)
	{
		SpeechRecognizer->StartSpeechRecognition(OnStartSpeechRecognitionEvent);
		IsAudioSessionMuted = bIsMuted;
		return;
	}

	UE_LOG(LogTemp, Error, TEXT("Unable to start audio session. Speech Recognizer is not defined."));
}

// Ends the session: halt the microphone capture first, then the recognizer,
// and finally wipe the activation state for the next session.
void UMyRecorderSpeechRecognition::StopAudioSession()
{
	if (CapturableSoundWave != nullptr)
	{
		CapturableSoundWave->StopCapture();
	}

	if (SpeechRecognizer != nullptr)
	{
		SpeechRecognizer->StopSpeechRecognition();
	}

	Reset();
}

// Updates the cached mute state and forwards it to the capture device.
// A call that does not change the state is a no-op.
void UMyRecorderSpeechRecognition::SetAudioSessionMute(const bool bIsMuted)
{
	const bool bChanged = (IsAudioSessionMuted != bIsMuted);
	if (!bChanged)
	{
		return;
	}

	IsAudioSessionMuted = bIsMuted;

	// Without a capture object there is nothing to toggle, but the cached
	// state is still updated above.
	if (CapturableSoundWave != nullptr)
	{
		CapturableSoundWave->ToggleMute(IsAudioSessionMuted);
	}
}

// Accessor for the cached mute flag (does not query the capture device).
bool UMyRecorderSpeechRecognition::GetAudioSessionMute()
{
	return IsAudioSessionMuted;
}

// Recognizer-finished callback. Intentionally empty: nothing to do when the
// recognizer reports completion in this implementation.
void UMyRecorderSpeechRecognition::OnRecognitionFinished()
{
}

// Recognizer error callback: surfaces both the short and long error
// descriptions in the log. The session is not restarted here.
void UMyRecorderSpeechRecognition::OnRecognitionError(const FString& ShortErrorMessage,
                                                           const FString& LongErrorMessage)
{
	UE_LOG(LogTemp, Error, TEXT("Speech Recognition Error. %s. %s"), *ShortErrorMessage, *LongErrorMessage);
}

// Called with each recognized text segment. Integration point: forward
// RecognizedWords to gameplay code here.
void UMyRecorderSpeechRecognition::OnRecognizedTextSegment(const FString& RecognizedWords)
{
	// Send text to game.
}

void UMyRecorderSpeechRecognition::OnStartSpeechRecognition(bool bSucceeded)
{
	if (!CapturableSoundWave)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to start speech recognition. CapturableSoundWave is not defined."));
		return;
	}

	Reset();

	CapturableSoundWave->StartCapture(DeviceId);
	if (!IsAudioSessionMuted) SetAudioSessionMute(IsAudioSessionMuted);
}

float MathSumAbs(const TArray<float>& Population)
{
	float Std = 0;

	for (const auto Data : Population)
	{
		Std += FMath::Abs(Data);
	}
	return Std;
}

void UMyRecorderSpeechRecognition::OnPopulateAudioData(const TArray<float>& PopulatedAudioData)
{
	if (!SpeechRecognizer)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. SpeechRecognizer is not defined."));
		return;
	}
	if (!CapturableSoundWave)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. CapturableSoundWave is not defined."));
		return;
	}

	const float Sum = MathSumAbs(PopulatedAudioData);
	const float Deviation = FMath::Abs(ActiveAverage - Sum);

	if (ActiveAverage <= 0)
	{
		ActiveAverage = Sum;
		ActiveDeviation = FMath::Max(Deviation, VoiceActivationMinDeviation);
		return;
	}

	if (Sum > ActiveAverage + (ActiveDeviation * VoiceActivationDeviationMultiplier))
	{
		FTimerManager& TimerManager = GetWorld()->GetTimerManager();
		if (TimerManager.IsTimerActive(VoiceActivationTimerHandle))
		{
			TimerManager.ClearTimer(VoiceActivationTimerHandle);
		}
		else
		{
			UE_LOG(LogTemp, Log, TEXT("Voice Activation: ON"));
		}

		VoiceActivationDelegate.BindUObject(this, &UMyRecorderSpeechRecognition::OnVoiceActivation);
		TimerManager.SetTimer(VoiceActivationTimerHandle, VoiceActivationDelegate,
		                      VoiceActivationDelay, false);
		IsVoiceActivated = true;
	}

	ActiveAverage = (ActiveAverage * 0.3f) + (Sum * 0.7f);
	ActiveDeviation = FMath::Max((ActiveDeviation + Deviation) * 0.5f, VoiceActivationMinDeviation);

	if (!IsVoiceActivated)
	{
		const int WindowNum = PopulatedAudioData.Num() * (VoiceActivationWindow - 1);
		if (AudioData.Num() > WindowNum)
		{
			const int NumDataToRemove = AudioData.Num() - WindowNum;
			AudioData.RemoveAt(0, NumDataToRemove);
		}
	}
	AudioData.Append(PopulatedAudioData);

	// SpeechRecognizer->ProcessAudioData(PopulatedAudioData, CapturableSoundWave->GetSampleRate(),
	//                                    CapturableSoundWave->GetNumOfChannels(), false);
}

// Returns the activation state machine to its initial condition: unseeded
// running average (-1 sentinel), floor deviation, inactive flag, empty buffer.
void UMyRecorderSpeechRecognition::Reset()
{
	ActiveAverage = -1;
	ActiveDeviation = VoiceActivationMinDeviation;
	IsVoiceActivated = false;
	AudioData.Empty();
}

void UMyRecorderSpeechRecognition::OnVoiceActivation()
{
	UE_LOG(LogTemp, Log, TEXT("Voice Activation: OFF"));

	SpeechRecognizer->ProcessAudioData(AudioData, CapturableSoundWave->GetSampleRate(),
	                                   CapturableSoundWave->GetNumOfChannels(), true);

	AudioData.Empty();
	IsVoiceActivated = false;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement — New feature or request

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions