diff --git a/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj b/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj
new file mode 100644
index 0000000..1b68214
--- /dev/null
+++ b/Platform_FVP_Corstone_SSE-300_Ethos-U55/microspeech.Example.Helium.cprj
@@ -0,0 +1,133 @@
+
+
+
+
+
+
+ Blinky
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ../VSI/audio/include;../VSI/include;../micro_speech/src
+ __FVP_PY
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/micro_speech/src/microfrontend/lib/fft.cc b/micro_speech/src/microfrontend/lib/fft.cc
index 0f309b8..5ea1f8a 100644
--- a/micro_speech/src/microfrontend/lib/fft.cc
+++ b/micro_speech/src/microfrontend/lib/fft.cc
@@ -19,6 +19,7 @@ limitations under the License.
#define FIXED_POINT 16
#include "kiss_fft.h"
#include "tools/kiss_fftr.h"
+#include
void FftCompute(struct FftState* state, const int16_t* input,
int input_scale_shift) {
@@ -38,9 +39,15 @@ void FftCompute(struct FftState* state, const int16_t* input,
}
// Apply the FFT.
+#ifdef USE_KISS_FFT
kiss_fftr(reinterpret_cast(state->scratch),
state->input,
reinterpret_cast(state->output));
+#else
+ arm_rfft_q15(reinterpret_cast(state->scratch),
+ state->input,
+ reinterpret_cast(state->output));
+#endif
}
void FftInit(struct FftState* state) {
diff --git a/micro_speech/src/microfrontend/lib/fft_util.cc b/micro_speech/src/microfrontend/lib/fft_util.cc
index 9f7b4c5..46ffa1b 100644
--- a/micro_speech/src/microfrontend/lib/fft_util.cc
+++ b/micro_speech/src/microfrontend/lib/fft_util.cc
@@ -20,6 +20,8 @@ limitations under the License.
#include "kiss_fft.h"
#include "tools/kiss_fftr.h"
+#include "arm_math.h"
+
int FftPopulateState(struct FftState* state, size_t input_size) {
state->input_size = input_size;
state->fft_size = 1;
@@ -41,6 +43,21 @@ int FftPopulateState(struct FftState* state, size_t input_size) {
return 0;
}
+#ifndef USE_KISS_FFT
+ arm_rfft_instance_q15 * cmsisFft;
+
+ cmsisFft = (arm_rfft_instance_q15 *)malloc(sizeof(arm_rfft_instance_q15));
+ if (cmsisFft == nullptr) {
+ fprintf(stderr, "Failed to alloc cmsis fft context\n");
+ return 0;
+ }
+ if (arm_rfft_init_q15(cmsisFft, state->fft_size, 0, 1) != ARM_MATH_SUCCESS) {
+ fprintf(stderr, "Failed to init cmsis fft \n");
+ return 0;
+ }
+
+ state->scratch = cmsisFft;
+#else
// Ask kissfft how much memory it wants.
size_t scratch_size = 0;
kiss_fftr_cfg kfft_cfg = kiss_fftr_alloc(
@@ -62,6 +79,7 @@ int FftPopulateState(struct FftState* state, size_t input_size) {
fprintf(stderr, "Kiss memory preallocation strategy failed.\n");
return 0;
}
+#endif
return 1;
}
diff --git a/micro_speech/src/microfrontend/lib/filterbank.c b/micro_speech/src/microfrontend/lib/filterbank.c
index 4ca79c5..f2010e6 100644
--- a/micro_speech/src/microfrontend/lib/filterbank.c
+++ b/micro_speech/src/microfrontend/lib/filterbank.c
@@ -18,6 +18,108 @@ limitations under the License.
#include "microfrontend/lib/bits.h"
+#ifdef __ARM_FEATURE_MVE
+
+#include
+#include "arm_math.h"
+
+
+#if (__ARM_FEATURE_MVE & 2)
+#define INVSQRT_MAGIC_F32 0x5f3759df
+
+__STATIC_INLINE f32x4_t visqrtf_f32(
+ f32x4_t vecIn)
+{
+ int32x4_t vecNewtonInit = vdupq_n_s32(INVSQRT_MAGIC_F32);
+ f32x4_t vecOneHandHalf = vdupq_n_f32(1.5f);
+ f32x4_t vecDst;
+ f32x4_t vecHalf;
+ int32x4_t vecTmpInt;
+ f32x4_t vecTmpFlt, vecTmpFlt1;
+
+
+ vecHalf = vmulq_n_f32(vecIn, 0.500001f);
+
+ /*
+ * cast input float vector to integer and right shift by 1
+ */
+ vecTmpInt = vshrq_n_s32((int32x4_t) vecIn, 1);
+ /*
+ * INVSQRT_MAGIC - ((vec_q32_t)vecIn >> 1)
+ */
+ vecTmpInt = vsubq(vecNewtonInit, vecTmpInt);
+ /*
+ *------------------------------
+ * 1st iteration
+ *------------------------------
+ * (1.5f-xhalf*x*x)
+ */
+ vecTmpFlt1 = vmulq((f32x4_t) vecTmpInt, (f32x4_t) vecTmpInt);
+ vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf);
+ vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1);
+ /*
+ * x = x*(1.5f-xhalf*x*x);
+ */
+ vecTmpFlt = vmulq((f32x4_t) vecTmpInt, vecTmpFlt1);
+
+ /*
+ *------------------------------
+ * 2nd iteration
+ *------------------------------
+ */
+ vecTmpFlt1 = vmulq(vecTmpFlt, vecTmpFlt);
+ vecTmpFlt1 = vmulq(vecTmpFlt1, vecHalf);
+ vecTmpFlt1 = vsubq(vecOneHandHalf, vecTmpFlt1);
+ vecDst = vmulq(vecTmpFlt, vecTmpFlt1);
+ /*
+ * set negative values to NAN
+ */
+ vecDst = vdupq_m(vecDst, NAN, vcmpltq(vecIn, 0.0f));
+ vecDst = vdupq_m(vecDst, INFINITY, vcmpeqq(vecIn, 0.0f));
+ return vecDst;
+}
+
+__STATIC_FORCEINLINE f32x4_t vsqrtf_f32(
+ f32x4_t vecIn)
+{
+ f32x4_t vecDst;
+
+ /* inverse square root unsing 2 newton iterations */
+ vecDst = visqrtf_f32(vecIn);
+ vecDst = vdupq_m(vecDst, 0.0f, vcmpeqq(vecIn, 0.0f));
+ vecDst = vecDst * vecIn;
+ return vecDst;
+}
+
+#endif
+
+
+static void arm_cmplx_lmag_squared_q15(
+ const int16_t * pSrc,
+ int32_t * pDst,
+ uint32_t numSamples)
+{
+ int32_t blkSize = numSamples;
+ int16x8_t vecSrc;
+ vecSrc = vld1q(pSrc);
+ pSrc += 8;
+
+ do {
+ mve_pred16_t p = vctp32q(blkSize);
+
+ vst1q_p(pDst,
+ vaddq_x(vmullbq_int(vecSrc, vecSrc), vmulltq_int(vecSrc, vecSrc), p), p);
+ vecSrc = vld1q_z(pSrc, p);
+
+ blkSize -= 4;
+ pSrc += 8;
+ pDst += 4;
+ }
+ while (blkSize > 0);
+}
+
+#endif
+
void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
struct complex_int16_t* fft_output,
int32_t* energy) {
@@ -25,6 +127,8 @@ void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
int i;
energy += state->start_index;
fft_output += state->start_index;
+
+#ifndef __ARM_FEATURE_MVE
for (i = state->start_index; i < end_index; ++i) {
const int32_t real = fft_output->real;
const int32_t imag = fft_output->imag;
@@ -32,6 +136,9 @@ void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
const uint32_t mag_squared = (real * real) + (imag * imag);
*energy++ = mag_squared;
}
+#else
+ arm_cmplx_lmag_squared_q15(&fft_output->real, energy, end_index - state->start_index);
+#endif
}
void FilterbankAccumulateChannels(struct FilterbankState* state,
@@ -46,6 +153,7 @@ void FilterbankAccumulateChannels(struct FilterbankState* state,
int num_channels_plus_1 = state->num_channels + 1;
int i;
+#ifndef __ARM_FEATURE_MVE
for (i = 0; i < num_channels_plus_1; ++i) {
const int32_t* magnitudes = energy + *channel_frequency_starts++;
const int16_t* weights = state->weights + *channel_weight_starts;
@@ -61,6 +169,34 @@ void FilterbankAccumulateChannels(struct FilterbankState* state,
weight_accumulator = unweight_accumulator;
unweight_accumulator = 0;
}
+#else
+ uint32_t* work32 = (uint32_t*)work;
+
+ for (i = 0; i < num_channels_plus_1; ++i) {
+ const int32_t* magnitudes = energy + *channel_frequency_starts++;
+ const int16_t* weights = state->weights + *channel_weight_starts;
+ const int16_t* unweights = state->unweights + *channel_weight_starts++;
+ const int width = *channel_widths++;
+ int j;
+
+ for (j = 0; j < width/4; ++j) {
+ weight_accumulator = vmlaldavaq(weight_accumulator, vld1q(magnitudes), vldrhq_s32(weights));
+ unweight_accumulator = vmlaldavaq(unweight_accumulator, vld1q(magnitudes), vldrhq_s32(unweights));
+
+ magnitudes += 4;
+ weights+=4;
+ unweights+=4;
+ }
+
+#if !(__ARM_FEATURE_MVE & 2)
+ *work++ = weight_accumulator;
+#else
+ *work32++ = asrl(weight_accumulator, 16);;
+#endif
+ weight_accumulator = unweight_accumulator;
+ unweight_accumulator = 0;
+ }
+#endif
}
static uint16_t Sqrt32(uint32_t num) {
@@ -116,6 +252,20 @@ static uint32_t Sqrt64(uint64_t num) {
return res;
}
+uint32_t* FilterbankSqrt1(struct FilterbankState* state, int scale_down_shift) {
+ const int num_channels = state->num_channels;
+ const uint64_t* work = state->work + 1;
+ // Reuse the work buffer since we're fine clobbering it at this point to hold
+ // the output.
+ uint32_t* output = (uint32_t*)state->work;
+ int i;
+ for (i = 0; i < num_channels; ++i) {
+ *output++ = Sqrt64(*work++) >> scale_down_shift;
+ }
+ return (uint32_t*)state->work;
+}
+
+
uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
const int num_channels = state->num_channels;
const uint64_t* work = state->work + 1;
@@ -123,9 +273,29 @@ uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
// the output.
uint32_t* output = (uint32_t*)state->work;
int i;
+
+#if !(__ARM_FEATURE_MVE & 2)
for (i = 0; i < num_channels; ++i) {
*output++ = Sqrt64(*work++) >> scale_down_shift;
}
+#else
+ const uint32_t* work32 = (uint32_t*)(state->work);//
+ // jump over 1st bin
+ work32 = work32 + 1;
+
+ float32_t scale = powf(2.0f, 8-scale_down_shift);
+
+ for (i = 0; i < num_channels/4; ++i) {
+ int32x4_t vsrc = vld1q(work32);
+ f32x4_t vsrcf = vcvtq_f32_s32(vsrc);
+ f32x4_t vdst = vsqrtf_f32(vsrcf);
+
+ vstrwq_u32(output, vcvtpq_s32_f32(vdst*scale));
+ output+=4;
+ work32+=4;
+ }
+
+#endif
return (uint32_t*)state->work;
}
diff --git a/micro_speech/src/microfrontend/lib/window.c b/micro_speech/src/microfrontend/lib/window.c
index 00a3f42..7343637 100644
--- a/micro_speech/src/microfrontend/lib/window.c
+++ b/micro_speech/src/microfrontend/lib/window.c
@@ -16,6 +16,49 @@ limitations under the License.
#include
+#ifdef __ARM_FEATURE_MVE
+
+#include
+
+static int16_t arm_win_process_samples_mve(const int16_t * pSrc,
+ const int16_t * pCoef, uint32_t blockSize, int16_t * pResult)
+{
+ int32_t blkSize = blockSize;
+ int16x8_t curExtremValVec = vdupq_n_s16(0);
+ int16_t maxValue = 0;
+
+ int16x8_t vecSrc = vld1q(pSrc);
+ pSrc += 8;
+
+ do {
+ mve_pred16_t p = vctp16q(blkSize);
+ int16x8_t vecDst, vecCoef;
+
+ vecCoef = vld1q_z(pCoef, p);
+
+ /* long multiply + narrowing */
+ vecDst = vuninitializedq_s16();
+ vecDst = vqshrnbq_m_n_s32(vecDst, vmullbq_int(vecSrc, vecCoef), kFrontendWindowBits, p);
+ vecDst = vqshrntq_m_n_s32(vecDst, vmulltq_int(vecSrc, vecCoef), kFrontendWindowBits, p);
+
+ vecSrc = vld1q_z(pSrc, p);
+
+ vst1q_p(pResult, vecDst, p);
+
+ curExtremValVec = vmaxq_m(vecDst, vabsq(vecDst), curExtremValVec, p);
+
+ blkSize -= 8;
+ pSrc += 8;
+ pCoef += 8;
+ pResult += 8;
+
+ }
+ while (blkSize > 0);
+
+ return (vmaxvq(maxValue, curExtremValVec));
+}
+#endif
+
int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
size_t num_samples, size_t* num_samples_read) {
const int size = state->size;
@@ -41,6 +84,9 @@ int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
int16_t* output = state->output;
int i;
int16_t max_abs_output_value = 0;
+#ifndef __ARM_FEATURE_MVE
+ int i;
+
for (i = 0; i < size; ++i) {
int16_t new_value =
(((int32_t)*input++) * *coefficients++) >> kFrontendWindowBits;
@@ -52,6 +98,10 @@ int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
max_abs_output_value = new_value;
}
}
+#else
+ max_abs_output_value = arm_win_process_samples_mve(input, coefficients, size, output);
+#endif
+
// Shuffle the input down by the step size, and update how much we have used.
memmove(state->input, state->input + state->step,
sizeof(*state->input) * (state->size - state->step));