Skip to content

Arm: Speed up -1..1 soft clipping with Neon #396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions celt/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ void celt_fatal(const char *str, const char *file, int line)
#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
#define FMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum float value. */
#define FMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum float value. */
#define UADD32(a,b) ((a)+(b))
#define USUB32(a,b) ((a)-(b))
#define MAXG(a,b) MAX32(a, b)
Expand Down
8 changes: 8 additions & 0 deletions celt/arm/arm_celt_map.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};

int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt) = {
opus_limit2_checkwithin1_c, /* ARMv4 */
opus_limit2_checkwithin1_c, /* EDSP */
opus_limit2_checkwithin1_c, /* Media */
opus_limit2_checkwithin1_neon,/* NEON */
opus_limit2_checkwithin1_neon /* DOTPROD */
};
# endif
# endif

Expand Down
77 changes: 77 additions & 0 deletions celt/arm/celt_neon_intr.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,85 @@ void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT
out[i] = FLOAT2INT16(in[i]);
}
}

int opus_limit2_checkwithin1_neon(float *samples, int cnt)
{
const float hardclipMin = -2.0f;
const float hardclipMax = 2.0f;

int i = 0;
int exceeding1 = 0;
int nextIndex = 0;

#if defined(__ARM_NEON)
const int BLOCK_SIZE = 16;
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;

float32x4_t min_all_0 = vdupq_n_f32(0.0f);
float32x4_t min_all_1 = vdupq_n_f32(0.0f);
float32x4_t max_all_0 = vdupq_n_f32(0.0f);
float32x4_t max_all_1 = vdupq_n_f32(0.0f);

float max, min;

for (i = 0; i < blockedSize; i += BLOCK_SIZE)
{
const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b));
max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d));
min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b));
min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d));
}

max = vmaxvf(vmaxq_f32(max_all_0, max_all_1));
min = vminvf(vminq_f32(min_all_0, min_all_1));

if (min < hardclipMin || max > hardclipMax)
{
const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin);
const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax);
for (i = 0; i < blockedSize; i += BLOCK_SIZE)
{
const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg));
const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg));
const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg));
const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg));
vst1q_f32(&samples[i + 0], clipped_a);
vst1q_f32(&samples[i + 4], clipped_b);
vst1q_f32(&samples[i + 8], clipped_c);
vst1q_f32(&samples[i + 12], clipped_d);
}
}

nextIndex = blockedSize;
exceeding1 |= max > 1.0f || min < -1.0f;

#endif

for (i = nextIndex; i < cnt; i++)
{
const float origVal = samples[i];
float clippedVal = origVal;
clippedVal = MAX16(hardclipMin, clippedVal);
clippedVal = MIN16(hardclipMax, clippedVal);
samples[i] = clippedVal;

exceeding1 |= origVal > 1.0f || origVal < -1.0f;
}

return !exceeding1;
}

#endif


#if defined(FIXED_POINT)
#include <string.h>

Expand Down
38 changes: 38 additions & 0 deletions celt/arm/mathops_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,30 @@ static inline int32x4_t vroundf(float32x4_t x)
# endif
}

static inline float vminvf(float32x4_t a)
{
#if defined(__aarch64__)
return vminvq_f32(a);
#else
float32x2_t xy = vmin_f32(vget_low_f32(a), vget_high_f32(a));
float x = vget_lane_f32(xy, 0);
float y = vget_lane_f32(xy, 1);
return x < y ? x : y;
#endif
}

static inline float vmaxvf(float32x4_t a)
{
#if defined(__aarch64__)
return vmaxvq_f32(a);
#else
float32x2_t xy = vmax_f32(vget_low_f32(a), vget_high_f32(a));
float x = vget_lane_f32(xy, 0);
float y = vget_lane_f32(xy, 1);
return x > y ? x : y;
#endif
}

void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
Expand All @@ -60,6 +84,20 @@ extern void
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
# endif

int opus_limit2_checkwithin1_neon(float * samples, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt);

# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
# define opus_limit2_checkwithin1(samples, cnt, arch) \
((*OPUS_LIMIT2_CHECKWITHIN1_IMPL[(arch)&OPUS_ARCHMASK])(samples, cnt))

# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
# define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_neon(samples, cnt))
# endif
# endif

#endif /* MATHOPS_ARM_H */
20 changes: 20 additions & 0 deletions celt/mathops.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,4 +229,24 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou
}
}

int opus_limit2_checkwithin1_c(float * samples, int cnt)
{
int i;
if (cnt <= 0)
{
return 1;
}

for (i = 0; i < cnt; i++)
{
float clippedVal = samples[i];
clippedVal = FMAX(-2.0f, clippedVal);
clippedVal = FMIN(2.0f, clippedVal);
samples[i] = clippedVal;
}

/* C implementation can't provide quick hint. Assume it might exceed -1/+1. */
return 0;
}

#endif /* DISABLE_FLOAT_API */
6 changes: 6 additions & 0 deletions celt/mathops.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,12 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
#endif

int opus_limit2_checkwithin1_c(float *samples, int cnt);

#ifndef OVERRIDE_LIMIT2_CHECKWITHIN1
#define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_c(samples, cnt))
#endif

#endif /* DISABLE_FLOAT_API */

#endif /* MATHOPS_H */
72 changes: 72 additions & 0 deletions celt/tests/test_unit_mathops.c
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,77 @@ void testcelt_float2int16(int use_ref_impl, int buffer_size)
#undef MAX_BUFFER_SIZE
}

void testopus_limit2_checkwithin1(int use_ref_impl)
{
#define BUFFER_SIZE 37 /* strange float count to trigger residue loop of SIMD implementation */
#define BYTE_COUNT (BUFFER_SIZE * sizeof(float))
int i, within1;
const int arch = opus_select_arch();

float pattern[BUFFER_SIZE], buffer[BUFFER_SIZE];

for (i = 0; i < BUFFER_SIZE; ++i)
{
pattern[i] = i % 2 ? -1.f : 1.f;
}

/* All values within -1..1:
Nothing changed. Return value is implementation-dependent (not expected to recognise nothing exceeds -1..1) */
memcpy(buffer, pattern, BYTE_COUNT);
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
{
fprintf (stderr, "opus_limit2_checkwithin1() modified values not exceeding -1..1 (ref=%d)\n", use_ref_impl);
ret = 1;
}

/* One value exceeds -1..1, within -2..2:
Values unchanged. Return value says not all values are within -1..1 */
for (i = 0; i < BUFFER_SIZE; ++i)
{
const float replace_value = pattern[i] * 1.001f;

memcpy(buffer, pattern, BYTE_COUNT);
buffer[i] = replace_value;
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
if (within1 || buffer[i] != replace_value)
{
fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -1..1 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
ret = 1;
}
buffer[i] = pattern[i];
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
{
fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
ret = 1;
}
}

/* One value exceeds -2..2:
One value is hardclipped, others are unchanged. Return value says not all values are within -1..1 */
for (i = 0; i < BUFFER_SIZE; ++i)
{
const float replace_value = pattern[i] * 2.1;

memcpy(buffer, pattern, BYTE_COUNT);
buffer[i] = replace_value;
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
if (within1 || buffer[i] != (replace_value > 0.f ? 2.f : -2.f))
{
fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -2..2 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
ret = 1;
}
buffer[i] = pattern[i];
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
{
fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
ret = 1;
}
}
#undef BUFFER_SIZE
#undef BYTE_COUNT
}

#endif

int main(void)
Expand All @@ -461,6 +532,7 @@ int main(void)
testcelt_float2int16(use_ref_impl[i], 32);
testcelt_float2int16(use_ref_impl[i], 127);
testcelt_float2int16(use_ref_impl[i], 1031);
testopus_limit2_checkwithin1(use_ref_impl[i]);
}
#endif
return ret;
Expand Down
50 changes: 40 additions & 10 deletions src/opus.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin and Koen Vos */
/*
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -30,23 +31,40 @@
#endif

#include "opus.h"
#include "celt/mathops.h"
#include "opus_private.h"

#ifndef DISABLE_FLOAT_API
OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)

void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch)
{
int c;
int i;
float *x;
int all_within_neg1pos1;

if (C<1 || N<1 || !_x || !declip_mem) return;

/* First thing: saturate everything to +/- 2 which is the highest level our
non-linearity can handle. At the point where the signal reaches +/-2,
the derivative will be zero anyway, so this doesn't introduce any
discontinuity in the derivative. */
for (i=0;i<N*C;i++)
_x[i] = MAX16(-2.f, MIN16(2.f, _x[i]));
/* Clamp everything within the range [-2, +2] which is the domain of the soft
clipping non-linearity. Outside the defined range the derivative will be zero,
therefore there is no discontinuity introduced here. The implementation
might provide a hint if all input samples are within the [-1, +1] range.

`opus_limit2_checkwithin1()`:
- Clamps all samples within the valid range [-2, +2].
- Generic C implementation:
* Does not attempt early detection whether samples are within hinted range.
* Always returns 0.
- Architecture specific implementation:
* Uses SIMD instructions to efficiently detect if all samples are
within the hinted range [-1, +1].
* Returns 1 if no samples exceed the hinted range, 0 otherwise.

`all_within_neg1pos1`:
- Optimization hint to skip per-sample out-of-bound checks.
If true, the check can be skipped. */
all_within_neg1pos1 = opus_limit2_checkwithin1(_x, N*C, arch);

for (c=0;c<C;c++)
{
float a;
Expand All @@ -72,10 +90,16 @@ OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
float maxval;
int special=0;
int peak_pos;
for (i=curr;i<N;i++)
/* Detection for early exit can be skipped if hinted by `all_within_neg1pos1` */
if (all_within_neg1pos1)
{
if (x[i*C]>1 || x[i*C]<-1)
break;
i = N;
} else {
for (i=curr;i<N;i++)
{
if (x[i*C]>1 || x[i*C]<-1)
break;
}
}
if (i==N)
{
Expand Down Expand Up @@ -135,6 +159,12 @@ OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
declip_mem[c] = a;
}
}

OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
{
opus_pcm_soft_clip_impl(_x, N, C, declip_mem, 0);
}

#endif

int encode_size(int size, unsigned char *data)
Expand Down
2 changes: 1 addition & 1 deletion src/opus_decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data,
OPUS_PRINT_INT(nb_samples);
#ifndef FIXED_POINT
if (soft_clip)
opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem);
opus_pcm_soft_clip_impl(pcm, nb_samples, st->channels, st->softclip_mem, st->arch);
else
st->softclip_mem[0]=st->softclip_mem[1]=0;
#endif
Expand Down
2 changes: 2 additions & 0 deletions src/opus_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int
void downmix_int24(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
int is_digital_silence(const opus_res* pcm, int frame_size, int channels, int lsb_depth);

void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch);

int encode_size(int size, unsigned char *data);

opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
Expand Down