From 02968edabcdefa92a78d27f115e08c3f7b235794 Mon Sep 17 00:00:00 2001 From: Sandor Vegh Date: Thu, 3 Oct 2024 16:08:21 +0200 Subject: [PATCH] Arm: Speed up -1..1 soft clipping with Neon If the signal exceeds -1..1 then, as error handling, the soft_clip function forces the signal back into -1..1. This is problematic since the search loop to find the next sample exceeding -1..1 is slow. If cheap on the current platform, while doing -2..2 hardclipping we can also detect if the signal never exceeds -1..1, avoiding the need for a second search loop. --- celt/arch.h | 2 + celt/arm/arm_celt_map.c | 8 ++++ celt/arm/celt_neon_intr.c | 77 ++++++++++++++++++++++++++++++++++ celt/arm/mathops_arm.h | 38 +++++++++++++++++ celt/mathops.c | 20 +++++++++ celt/mathops.h | 6 +++ celt/tests/test_unit_mathops.c | 72 +++++++++++++++++++++++++++++++ src/opus.c | 50 +++++++++++++++++----- src/opus_decoder.c | 2 +- src/opus_private.h | 2 + 10 files changed, 266 insertions(+), 11 deletions(-) diff --git a/celt/arch.h b/celt/arch.h index e8321ac85..ff2ee510f 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -103,6 +103,8 @@ void celt_fatal(const char *str, const char *file, int line) #define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */ #define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */ #define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */ +#define FMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum float value. */ +#define FMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum float value. */ #define UADD32(a,b) ((a)+(b)) #define USUB32(a,b) ((a)-(b)) #define MAXG(a,b) MAX32(a, b) diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index d9980444e..931c011dd 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -46,6 +46,14 @@ void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT celt_float2int16_neon,/* NEON */ celt_float2int16_neon /* DOTPROD */ }; + +int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt) = { + opus_limit2_checkwithin1_c, /* ARMv4 */ + opus_limit2_checkwithin1_c, /* EDSP */ + opus_limit2_checkwithin1_c, /* Media */ + opus_limit2_checkwithin1_neon,/* NEON */ + opus_limit2_checkwithin1_neon /* DOTPROD */ +}; # endif # endif diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c index 32b6e5ac0..b87bebc11 100644 --- a/celt/arm/celt_neon_intr.c +++ b/celt/arm/celt_neon_intr.c @@ -86,8 +86,85 @@ void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out[i] = FLOAT2INT16(in[i]); } } + +int opus_limit2_checkwithin1_neon(float *samples, int cnt) +{ + const float hardclipMin = -2.0f; + const float hardclipMax = 2.0f; + + int i = 0; + int exceeding1 = 0; + int nextIndex = 0; + +#if defined(__ARM_NEON) + const int BLOCK_SIZE = 16; + const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE; + + float32x4_t min_all_0 = vdupq_n_f32(0.0f); + float32x4_t min_all_1 = vdupq_n_f32(0.0f); + float32x4_t max_all_0 = vdupq_n_f32(0.0f); + float32x4_t max_all_1 = vdupq_n_f32(0.0f); + + float max, min; + + for (i = 0; i < blockedSize; i += BLOCK_SIZE) + { + const float32x4_t orig_a = vld1q_f32(&samples[i + 0]); + const float32x4_t orig_b = vld1q_f32(&samples[i + 4]); + const float32x4_t orig_c = vld1q_f32(&samples[i + 8]); + const float32x4_t orig_d = vld1q_f32(&samples[i + 12]); + max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b)); + max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d)); + min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b)); + min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d)); + } + + max = vmaxvf(vmaxq_f32(max_all_0, max_all_1)); + min = vminvf(vminq_f32(min_all_0, min_all_1)); + + if (min < hardclipMin || max > hardclipMax) + { + const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin); + const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax); + for (i = 0; i < blockedSize; i += BLOCK_SIZE) + { + const float32x4_t orig_a = vld1q_f32(&samples[i + 0]); + const float32x4_t orig_b = vld1q_f32(&samples[i + 4]); + const float32x4_t orig_c = vld1q_f32(&samples[i + 8]); + const float32x4_t orig_d = vld1q_f32(&samples[i + 12]); + const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg)); + const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg)); + const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg)); + const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg)); + vst1q_f32(&samples[i + 0], clipped_a); + vst1q_f32(&samples[i + 4], clipped_b); + vst1q_f32(&samples[i + 8], clipped_c); + vst1q_f32(&samples[i + 12], clipped_d); + } + } + + nextIndex = blockedSize; + exceeding1 |= max > 1.0f || min < -1.0f; + #endif + for (i = nextIndex; i < cnt; i++) + { + const float origVal = samples[i]; + float clippedVal = origVal; + clippedVal = MAX16(hardclipMin, clippedVal); + clippedVal = MIN16(hardclipMax, clippedVal); + samples[i] = clippedVal; + + exceeding1 |= origVal > 1.0f || origVal < -1.0f; + } + + return !exceeding1; +} + +#endif + + #if defined(FIXED_POINT) #include diff --git a/celt/arm/mathops_arm.h b/celt/arm/mathops_arm.h index ced719d32..b1f916997 100644 --- a/celt/arm/mathops_arm.h +++ b/celt/arm/mathops_arm.h @@ -46,6 +46,30 @@ static inline int32x4_t vroundf(float32x4_t x) # endif } +static inline float vminvf(float32x4_t a) +{ +#if defined(__aarch64__) + return vminvq_f32(a); +#else + float32x2_t xy = vmin_f32(vget_low_f32(a), vget_high_f32(a)); + float x = vget_lane_f32(xy, 0); + float y = vget_lane_f32(xy, 1); + return x < y ? x : y; +#endif +} + +static inline float vmaxvf(float32x4_t a) +{ +#if defined(__aarch64__) + return vmaxvq_f32(a); +#else + float32x2_t xy = vmax_f32(vget_low_f32(a), vget_high_f32(a)); + float x = vget_lane_f32(xy, 0); + float y = vget_lane_f32(xy, 1); + return x > y ? x : y; +#endif +} + void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt); # if defined(OPUS_HAVE_RTCD) && \ (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) @@ -60,6 +84,20 @@ extern void # define OVERRIDE_FLOAT2INT16 (1) # define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt)) # endif + +int opus_limit2_checkwithin1_neon(float * samples, int cnt); +# if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt); + +# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1) +# define opus_limit2_checkwithin1(samples, cnt, arch) \ + ((*OPUS_LIMIT2_CHECKWITHIN1_IMPL[(arch)&OPUS_ARCHMASK])(samples, cnt)) + +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1) +# define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_neon(samples, cnt)) +# endif # endif #endif /* MATHOPS_ARM_H */ diff --git a/celt/mathops.c b/celt/mathops.c index 0ad57ca71..574739ca4 100644 --- a/celt/mathops.c +++ b/celt/mathops.c @@ -229,4 +229,24 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou } } +int opus_limit2_checkwithin1_c(float * samples, int cnt) +{ + int i; + if (cnt <= 0) + { + return 1; + } + + for (i = 0; i < cnt; i++) + { + float clippedVal = samples[i]; + clippedVal = FMAX(-2.0f, clippedVal); + clippedVal = FMIN(2.0f, clippedVal); + samples[i] = clippedVal; + } + + /* C implementation can't provide quick hint. Assume it might exceed -1/+1. */ + return 0; +} + #endif /* DISABLE_FLOAT_API */ diff --git a/celt/mathops.h b/celt/mathops.h index 24dbfb4cc..6055c2189 100644 --- a/celt/mathops.h +++ b/celt/mathops.h @@ -490,6 +490,12 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou #define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt)) #endif +int opus_limit2_checkwithin1_c(float *samples, int cnt); + +#ifndef OVERRIDE_LIMIT2_CHECKWITHIN1 +#define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_c(samples, cnt)) +#endif + #endif /* DISABLE_FLOAT_API */ #endif /* MATHOPS_H */ diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 98fcdec4b..f1e45e1bf 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -435,6 +435,77 @@ void testcelt_float2int16(int use_ref_impl, int buffer_size) #undef MAX_BUFFER_SIZE } +void testopus_limit2_checkwithin1(int use_ref_impl) +{ +#define BUFFER_SIZE 37 /* strange float count to trigger residue loop of SIMD implementation */ +#define BYTE_COUNT (BUFFER_SIZE * sizeof(float)) + int i, within1; + const int arch = opus_select_arch(); + + float pattern[BUFFER_SIZE], buffer[BUFFER_SIZE]; + + for (i = 0; i < BUFFER_SIZE; ++i) + { + pattern[i] = i % 2 ? -1.f : 1.f; + } + + /* All values within -1..1: + Nothing changed. Return value is implementation-dependent (not expected to recognise nothing exceeds -1..1) */ + memcpy(buffer, pattern, BYTE_COUNT); + within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch); + if (memcmp(buffer, pattern, BYTE_COUNT) != 0) + { + fprintf (stderr, "opus_limit2_checkwithin1() modified values not exceeding -1..1 (ref=%d)\n", use_ref_impl); + ret = 1; + } + + /* One value exceeds -1..1, within -2..2: + Values unchanged. Return value says not all values are within -1..1 */ + for (i = 0; i < BUFFER_SIZE; ++i) + { + const float replace_value = pattern[i] * 1.001f; + + memcpy(buffer, pattern, BYTE_COUNT); + buffer[i] = replace_value; + within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch); + if (within1 || buffer[i] != replace_value) + { + fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -1..1 erroneously (ref=%d, i=%d)\n", use_ref_impl, i); + ret = 1; + } + buffer[i] = pattern[i]; + if (memcmp(buffer, pattern, BYTE_COUNT) != 0) + { + fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i); + ret = 1; + } + } + + /* One value exceeds -2..2: + One value is hardclipped, others are unchanged. Return value says not all values are within -1..1 */ + for (i = 0; i < BUFFER_SIZE; ++i) + { + const float replace_value = pattern[i] * 2.1; + + memcpy(buffer, pattern, BYTE_COUNT); + buffer[i] = replace_value; + within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch); + if (within1 || buffer[i] != (replace_value > 0.f ? 2.f : -2.f)) + { + fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -2..2 erroneously (ref=%d, i=%d)\n", use_ref_impl, i); + ret = 1; + } + buffer[i] = pattern[i]; + if (memcmp(buffer, pattern, BYTE_COUNT) != 0) + { + fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i); + ret = 1; + } + } +#undef BUFFER_SIZE +#undef BYTE_COUNT +} + #endif int main(void) @@ -461,6 +532,7 @@ int main(void) testcelt_float2int16(use_ref_impl[i], 32); testcelt_float2int16(use_ref_impl[i], 127); testcelt_float2int16(use_ref_impl[i], 1031); + testopus_limit2_checkwithin1(use_ref_impl[i]); } #endif return ret; diff --git a/src/opus.c b/src/opus.c index 816a4dd5f..cfd4861d4 100644 --- a/src/opus.c +++ b/src/opus.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited + Copyright (c) 2024 Arm Limited Written by Jean-Marc Valin and Koen Vos */ /* Redistribution and use in source and binary forms, with or without @@ -30,23 +31,40 @@ #endif #include "opus.h" +#include "celt/mathops.h" #include "opus_private.h" #ifndef DISABLE_FLOAT_API -OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem) + +void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch) { int c; int i; float *x; + int all_within_neg1pos1; if (C<1 || N<1 || !_x || !declip_mem) return; - /* First thing: saturate everything to +/- 2 which is the highest level our - non-linearity can handle. At the point where the signal reaches +/-2, - the derivative will be zero anyway, so this doesn't introduce any - discontinuity in the derivative. */ - for (i=0;i1 || x[i*C]<-1) - break; + i = N; + } else { + for (i=curr;i1 || x[i*C]<-1) + break; + } } if (i==N) { @@ -135,6 +159,12 @@ OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem) declip_mem[c] = a; } } + +OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem) +{ + opus_pcm_soft_clip_impl(_x, N, C, declip_mem, 0); +} + #endif int encode_size(int size, unsigned char *data) diff --git a/src/opus_decoder.c b/src/opus_decoder.c index 190221b7c..e94bea708 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -810,7 +810,7 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data, OPUS_PRINT_INT(nb_samples); #ifndef FIXED_POINT if (soft_clip) - opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem); + opus_pcm_soft_clip_impl(pcm, nb_samples, st->channels, st->softclip_mem, st->arch); else st->softclip_mem[0]=st->softclip_mem[1]=0; #endif diff --git a/src/opus_private.h b/src/opus_private.h index 19deb6ee2..c673c3e98 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -177,6 +177,8 @@ void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int void downmix_int24(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C); int is_digital_silence(const opus_res* pcm, int frame_size, int channels, int lsb_depth); +void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch); + int encode_size(int size, unsigned char *data); opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);