From 16c62b84e68fd58cc92c167af6875a49a74f9f49 Mon Sep 17 00:00:00 2001
From: Daniel Lemire
Date: Mon, 26 Mar 2018 03:27:09 +0200
Subject: [PATCH] Slightly better neon code.

---
 mandel_neon.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/mandel_neon.c b/mandel_neon.c
index c6f0dca..970dc2f 100644
--- a/mandel_neon.c
+++ b/mandel_neon.c
@@ -1,6 +1,13 @@
 #include <arm_neon.h>
 #include "mandel.h"
 
+static inline int is_zero(uint32x4_t mask) {
+    uint64x2_t v64 = vreinterpretq_u64_u32(mask);
+    uint32x2_t v32 = vqmovn_u64(v64);
+    uint64x1_t result = vreinterpret_u64_u32(v32);
+    return result[0] == 0;
+}
+
 void
 mandel_neon(unsigned char *image, const struct spec *s)
 {
@@ -12,9 +19,7 @@ mandel_neon(unsigned char *image, const struct spec *s)
     float32x4_t one = vdupq_n_f32(1);
     float32x4_t iter_scale = vdupq_n_f32(1.0f / s->iterations);
     float32x4_t depth_scale = vdupq_n_f32(s->depth - 1);
-    float32x4_t c0123;  // {0.0f, 1.0f, 2.0f, 3.0f}
-    for (int i = 0; i < 4; i++)
-        c0123 = vsetq_lane_f32(i, c0123, i);
+    float32x4_t c0123 = {0.0f, 1.0f, 2.0f, 3.0f};
 
     #pragma omp parallel for schedule(dynamic, 1)
     for (int y = 0; y < s->height; y++) {
@@ -44,10 +49,7 @@ mandel_neon(unsigned char *image, const struct spec *s)
                 uint32x4_t mask = vcltq_f32(mag2, threshold);
 
                 /* Early bailout? */
-                if (vgetq_lane_u32(mask, 0) == 0 &&
-                    vgetq_lane_u32(mask, 1) == 0 &&
-                    vgetq_lane_u32(mask, 2) == 0 &&
-                    vgetq_lane_u32(mask, 3) == 0)
+                if (is_zero(mask))
                     break;
 
                 /* Increment k */