From cca4c9fa096f98fb8aee85acefa0673a23fb47eb Mon Sep 17 00:00:00 2001 From: FabKlein Date: Wed, 15 Sep 2021 09:22:17 +0000 Subject: [PATCH] Replaced KissFFT with CMSIS DSP FFT and added Helium optimizations in the audio frontend (microspeech.Example.Helium.cprj project) --- .../microspeech.Example.Helium.cprj | 133 ++++++++++++++ micro_speech/src/microfrontend/lib/fft.cc | 7 + .../src/microfrontend/lib/fft_util.cc | 18 ++ .../src/microfrontend/lib/filterbank.c | 170 ++++++++++++++++++ micro_speech/src/microfrontend/lib/window.c | 50 ++++++ 5 files changed, 378 insertions(+) create mode 100644 Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj diff --git a/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj b/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj new file mode 100644 index 0000000..1b68214 --- /dev/null +++ b/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj @@ -0,0 +1,133 @@ + + + + + + + Blinky + + + + + + + + + + + + + + + + + + + + + + + + + ../VSI/audio/include;../VSI/include;../micro_speech/src + __FVP_PY + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/micro_speech/src/microfrontend/lib/fft.cc b/micro_speech/src/microfrontend/lib/fft.cc index f605b1d..2897237 100644 --- a/micro_speech/src/microfrontend/lib/fft.cc +++ b/micro_speech/src/microfrontend/lib/fft.cc @@ -19,6 +19,7 @@ limitations under the License. #define FIXED_POINT 16 #include "kiss_fft.h" #include "tools/kiss_fftr.h" +#include void FftCompute(struct FftState* state, const int16_t* input, int input_scale_shift) { @@ -38,9 +39,15 @@ void FftCompute(struct FftState* state, const int16_t* input, } // Apply the FFT. +#ifdef USE_KISS_FFT kiss_fftr(reinterpret_cast(state->scratch), state->input, reinterpret_cast(state->output)); +#else + arm_rfft_q15(reinterpret_cast(state->scratch), + state->input, + reinterpret_cast(state->output)); +#endif } void FftInit(struct FftState* state) { diff --git a/micro_speech/src/microfrontend/lib/fft_util.cc b/micro_speech/src/microfrontend/lib/fft_util.cc index 95d618a..4808820 100644 --- a/micro_speech/src/microfrontend/lib/fft_util.cc +++ b/micro_speech/src/microfrontend/lib/fft_util.cc @@ -20,6 +20,8 @@ limitations under the License. #include "kiss_fft.h" #include "tools/kiss_fftr.h" +#include "arm_math.h" + int FftPopulateState(struct FftState* state, size_t input_size) { state->input_size = input_size; state->fft_size = 1; @@ -41,6 +43,21 @@ int FftPopulateState(struct FftState* state, size_t input_size) { return 0; } +#ifndef USE_KISS_FFT + arm_rfft_instance_q15 * cmsisFft; + + cmsisFft = (arm_rfft_instance_q15 *)malloc(sizeof(arm_rfft_instance_q15)); + if (cmsisFft == nullptr) { + fprintf(stderr, "Failed to alloc cmsis fft context\n"); + return 0; + } + if (arm_rfft_init_q15(cmsisFft, state->fft_size, 0, 1) != ARM_MATH_SUCCESS) { + fprintf(stderr, "Failed to init cmsis fft \n"); + return 0; + } + + state->scratch = cmsisFft; +#else // Ask kissfft how much memory it wants. size_t scratch_size = 0; kiss_fftr_cfg kfft_cfg = kiss_fftr_alloc( @@ -62,6 +79,7 @@ int FftPopulateState(struct FftState* state, size_t input_size) { fprintf(stderr, "Kiss memory preallocation strategy failed.\n"); return 0; } +#endif return 1; } diff --git a/micro_speech/src/microfrontend/lib/filterbank.c b/micro_speech/src/microfrontend/lib/filterbank.c index 4ca79c5..f2010e6 100644 --- a/micro_speech/src/microfrontend/lib/filterbank.c +++ b/micro_speech/src/microfrontend/lib/filterbank.c @@ -18,6 +18,108 @@ limitations under the License. #include "microfrontend/lib/bits.h" +#ifdef __ARM_FEATURE_MVE + +#include +#include "arm_math.h" + + +#if (__ARM_FEATURE_MVE & 2) +#define INVSQRT_MAGIC_F32 0x5f3759df + +__STATIC_INLINE f32x4_t visqrtf_f32( + f32x4_t vecIn) +{ + int32x4_t vecNewtonInit = vdupq_n_s32(INVSQRT_MAGIC_F32); + f32x4_t vecOneHandHalf = vdupq_n_f32(1.5f); + f32x4_t vecDst; + f32x4_t vecHalf; + int32x4_t vecTmpInt; + f32x4_t vecTmpFlt, vecTmpFlt1; + + + vecHalf = vmulq_n_f32(vecIn, 0.500001f); + + /* + * cast input float vector to integer and right shift by 1 + */ + vecTmpInt = vshrq_n_s32((int32x4_t) vecIn, 1); + /* + * INVSQRT_MAGIC - ((vec_q32_t)vecIn >> 1) + */ + vecTmpInt = vsubq(vecNewtonInit, vecTmpInt); + /* + *------------------------------ + * 1st iteration + *------------------------------ + * (1.5f-xhalf*x*x) + */ + vecTmpFlt1 = vmulq((f32x4_t) vecTmpInt, (f32x4_t) vecTmpInt); + vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf); + vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1); + /* + * x = x*(1.5f-xhalf*x*x); + */ + vecTmpFlt = vmulq((f32x4_t) vecTmpInt, vecTmpFlt1); + + /* + *------------------------------ + * 2nd iteration + *------------------------------ + */ + vecTmpFlt1 = vmulq(vecTmpFlt, vecTmpFlt); + vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf); + vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1); + vecDst = vmulq(vecTmpFlt, vecTmpFlt1); + /* + * set negative values to NAN + */ + vecDst = vdupq_m(vecDst, NAN, vcmpltq(vecIn, 0.0f)); + vecDst = vdupq_m(vecDst, INFINITY, vcmpeqq(vecIn, 0.0f)); + return vecDst; +} + +__STATIC_FORCEINLINE f32x4_t vsqrtf_f32( + f32x4_t vecIn) +{ + f32x4_t vecDst; + + /* inverse square root unsing 2 newton iterations */ + vecDst = visqrtf_f32(vecIn); + vecDst = vdupq_m(vecDst, 0.0f, vcmpeqq(vecIn, 0.0f)); + vecDst = vecDst * vecIn; + return vecDst; +} + +#endif + + +static void arm_cmplx_lmag_squared_q15( + const int16_t * pSrc, + int32_t * pDst, + uint32_t numSamples) +{ + int32_t blkSize = numSamples; + int16x8_t vecSrc; + vecSrc = vld1q(pSrc); + pSrc += 8; + + do { + mve_pred16_t p = vctp32q(blkSize); + + vst1q_p(pDst, + vaddq_x(vmullbq_int(vecSrc, vecSrc), vmulltq_int(vecSrc, vecSrc), p), p); + vecSrc = vld1q_z(pSrc, p); + + blkSize -= 4; + pSrc += 8; + pDst += 4; + } + while (blkSize > 0); +} + +#endif + void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state, struct complex_int16_t* fft_output, int32_t* energy) { @@ -25,6 +127,8 @@ void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state, int i; energy += state->start_index; fft_output += state->start_index; + +#ifndef __ARM_FEATURE_MVE for (i = state->start_index; i < end_index; ++i) { const int32_t real = fft_output->real; const int32_t imag = fft_output->imag; @@ -32,6 +136,9 @@ void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state, const uint32_t mag_squared = (real * real) + (imag * imag); *energy++ = mag_squared; } +#else + arm_cmplx_lmag_squared_q15(&fft_output->real, energy, end_index - state->start_index); +#endif } void FilterbankAccumulateChannels(struct FilterbankState* state, @@ -46,6 +153,7 @@ void FilterbankAccumulateChannels(struct FilterbankState* state, int num_channels_plus_1 = state->num_channels + 1; int i; +#ifndef __ARM_FEATURE_MVE for (i = 0; i < num_channels_plus_1; ++i) { const int32_t* magnitudes = energy + *channel_frequency_starts++; const int16_t* weights = state->weights + *channel_weight_starts; @@ -61,6 +169,34 @@ void FilterbankAccumulateChannels(struct FilterbankState* state, weight_accumulator = unweight_accumulator; unweight_accumulator = 0; } +#else + uint32_t* work32 = (uint32_t*)work; + + for (i = 0; i < num_channels_plus_1; ++i) { + const int32_t* magnitudes = energy + *channel_frequency_starts++; + const int16_t* weights = state->weights + *channel_weight_starts; + const int16_t* unweights = state->unweights + *channel_weight_starts++; + const int width = *channel_widths++; + int j; + + for (j = 0; j < width/4; ++j) { + weight_accumulator = vmlaldavaq(weight_accumulator, vld1q(magnitudes), vldrhq_s32(weights)); + unweight_accumulator = vmlaldavaq(unweight_accumulator, vld1q(magnitudes), vldrhq_s32(unweights)); + + magnitudes += 4; + weights+=4; + unweights+=4; + } + +#if !(__ARM_FEATURE_MVE & 2) + *work++ = weight_accumulator; +#else + *work32++ = asrl(weight_accumulator, 16);; +#endif + weight_accumulator = unweight_accumulator; + unweight_accumulator = 0; + } +#endif } static uint16_t Sqrt32(uint32_t num) { @@ -116,6 +252,20 @@ static uint32_t Sqrt64(uint64_t num) { return res; } +uint32_t* FilterbankSqrt1(struct FilterbankState* state, int scale_down_shift) { + const int num_channels = state->num_channels; + const uint64_t* work = state->work + 1; + // Reuse the work buffer since we're fine clobbering it at this point to hold + // the output. + uint32_t* output = (uint32_t*)state->work; + int i; + for (i = 0; i < num_channels; ++i) { + *output++ = Sqrt64(*work++) >> scale_down_shift; + } + return (uint32_t*)state->work; +} + + uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) { const int num_channels = state->num_channels; const uint64_t* work = state->work + 1; @@ -123,9 +273,29 @@ uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) { // the output. uint32_t* output = (uint32_t*)state->work; int i; + +#if !(__ARM_FEATURE_MVE & 2) for (i = 0; i < num_channels; ++i) { *output++ = Sqrt64(*work++) >> scale_down_shift; } +#else + const uint32_t* work32 = (uint32_t*)(state->work);// + // jump over 1st bin + work32 = work32 + 1; + + float32_t scale = powf(2.0f, 8-scale_down_shift); + + for (i = 0; i < num_channels/4; ++i) { + int32x4_t vsrc = vld1q(work32); + f32x4_t vsrcf = vcvtq_f32_s32(vsrc); + f32x4_t vdst = vsqrtf_f32(vsrcf); + + vstrwq_u32(output, vcvtpq_s32_f32(vdst*scale)); + output+=4; + work32+=4; + } + +#endif return (uint32_t*)state->work; } diff --git a/micro_speech/src/microfrontend/lib/window.c b/micro_speech/src/microfrontend/lib/window.c index 00a3f42..7343637 100644 --- a/micro_speech/src/microfrontend/lib/window.c +++ b/micro_speech/src/microfrontend/lib/window.c @@ -16,6 +16,49 @@ limitations under the License. #include +#ifdef __ARM_FEATURE_MVE + +#include + +static int16_t arm_win_process_samples_mve(const int16_t * pSrc, + const int16_t * pCoef, uint32_t blockSize, int16_t * pResult) +{ + int32_t blkSize = blockSize; + int16x8_t curExtremValVec = vdupq_n_s16(0); + int16_t maxValue = 0; + + int16x8_t vecSrc = vld1q(pSrc); + pSrc += 8; + + do { + mve_pred16_t p = vctp16q(blkSize); + int16x8_t vecDst, vecCoef; + + vecCoef = vld1q_z(pCoef, p); + + /* long multiply + narrowing */ + vecDst = vuninitializedq_s16(); + vecDst = vqshrnbq_m_n_s32(vecDst, vmullbq_int(vecSrc, vecCoef), kFrontendWindowBits, p); + vecDst = vqshrntq_m_n_s32(vecDst, vmulltq_int(vecSrc, vecCoef), kFrontendWindowBits, p); + + vecSrc = vld1q_z(pSrc, p); + + vst1q_p(pResult, vecDst, p); + + curExtremValVec = vmaxq_m(vecDst, vabsq(vecDst), curExtremValVec, p); + + blkSize -= 8; + pSrc += 8; + pCoef += 8; + pResult += 8; + + } + while (blkSize > 0); + + return (vmaxvq(maxValue, curExtremValVec)); +} +#endif + int WindowProcessSamples(struct WindowState* state, const int16_t* samples, size_t num_samples, size_t* num_samples_read) { const int size = state->size; @@ -41,6 +84,9 @@ int WindowProcessSamples(struct WindowState* state, const int16_t* samples, int16_t* output = state->output; int i; int16_t max_abs_output_value = 0; +#ifndef __ARM_FEATURE_MVE + int i; + for (i = 0; i < size; ++i) { int16_t new_value = (((int32_t)*input++) * *coefficients++) >> kFrontendWindowBits; @@ -52,6 +98,10 @@ int WindowProcessSamples(struct WindowState* state, const int16_t* samples, max_abs_output_value = new_value; } } +#else + max_abs_output_value = arm_win_process_samples_mve(input, coefficients, size, output); +#endif + // Shuffle the input down by the step size, and update how much we have used. memmove(state->input, state->input + state->step, sizeof(*state->input) * (state->size - state->step));