From 6ec11b3102b1c0538749273ee920912df2b4ce46 Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Wed, 13 Jul 2022 15:17:19 +0200 Subject: [PATCH 1/7] add __ARM_NEON support --- .gitignore | 1 + CMakeLists.txt | 15 ++- src/lepton/idct.cc | 216 ++++++++++++++++++++++++++++++-- src/lepton/jpgcoder.cc | 13 +- src/lepton/recoder.cc | 24 +++- src/lepton/vp8_encoder.cc | 22 +++- src/vp8/model/model.cc | 45 ++++++- src/vp8/model/model.hh | 221 ++++++++++++++++++++++++++++++++- src/vp8/model/numeric.hh | 10 +- src/vp8/util/block_context.hh | 56 ++++++++- src/vp8/util/generic_worker.cc | 14 ++- src/vp8/util/memory.cc | 12 +- test_suite/test_trunc.sh | 2 +- 13 files changed, 594 insertions(+), 57 deletions(-) diff --git a/.gitignore b/.gitignore index e4489f83..e18fed3d 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ Makefile.in /test_custom_table.sh.log /test_custom_table.sh.trs .dirstamp +/cmake-build*/ diff --git a/CMakeLists.txt b/CMakeLists.txt index b3c51f86..49f53ba9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,8 +22,7 @@ if(ENABLE_ANS_EXPERIMENTAL) set(ANS_FLAGS "-DENABLE_ANS_EXPERIMENTAL") endif() - -if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc") +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|arm|aarch") option(SSE_VECTORIZATION "SSE instructions" OFF) else() option(SSE_VECTORIZATION "SSE instructions" ON) @@ -278,8 +277,8 @@ set(LEPTON_SOURCES src/io/MemMgrAllocator.cc src/io/MemMgrAllocator.hh ) -if(SSE_VECTORIZATION) add_executable(lepton ${LEPTON_SOURCES}) +if(SSE_VECTORIZATION) add_executable(lepton-slow-best-ratio ${LEPTON_SOURCES}) add_executable(lepton-avx ${LEPTON_SOURCES}) endif() @@ -371,15 +370,15 @@ if(USE_SYSTEM_DEPENDENCIES) include_directories(${ZLIB_INCLUDE_DIRS}) find_package(OpenSSL) include_directories(${OPENSSL_INCLUDE_DIRS}) + target_link_libraries(lepton localbrotli ${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${ADDITIONAL_FLAGS}) if(SSE_VECTORIZATION) - target_link_libraries(lepton localbrotli ${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${ADDITIONAL_FLAGS}) target_link_libraries(lepton-slow-best-ratio localbrotli ${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${ADDITIONAL_FLAGS}) target_link_libraries(lepton-avx localbrotli ${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${ADDITIONAL_FLAGS}) endif() target_link_libraries(lepton-scalar localbrotli ${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${ADDITIONAL_FLAGS}) else() + target_link_libraries(lepton localzlib localbrotli localmd5 ${ADDITIONAL_FLAGS}) if(SSE_VECTORIZATION) - target_link_libraries(lepton localzlib localbrotli localmd5 ${ADDITIONAL_FLAGS}) target_link_libraries(lepton-slow-best-ratio localzlib localbrotli localmd5 ${ADDITIONAL_FLAGS}) target_link_libraries(lepton-avx localzlib localbrotli localmd5 ${ADDITIONAL_FLAGS}) endif() @@ -392,8 +391,8 @@ else() endif() set_target_properties(localzlib PROPERTIES COMPILE_FLAGS "${VECTOR_FLAGS} ${ZLIB_EXTRA_INCLUDE_DIRS} ${ADDITIONAL_COMPILE_FLAGS} ${ADDITIONAL_DEFINES}") endif() -if(SSE_VECTORIZATION) set_target_properties(lepton PROPERTIES COMPILE_FLAGS "${VECTOR_FLAGS} ${ADDITIONAL_COMPILE_FLAGS} ${ADDITIONAL_DEFINES} ${ALLOCATOR_FLAGS} ${ANS_FLAGS} ${BILLING_FLAGS}") +if(SSE_VECTORIZATION) set_target_properties(lepton-slow-best-ratio PROPERTIES COMPILE_FLAGS "${VECTOR_FLAGS} ${ADDITIONAL_COMPILE_FLAGS} ${ADDITIONAL_DEFINES} ${ALLOCATOR_FLAGS} ${ANS_FLAGS} ${BILLING_FLAGS} -DDEFAULT_SINGLE_THREAD") set_target_properties(lepton-avx PROPERTIES COMPILE_FLAGS "${ARCH_AVX2_FLAGS} ${ADDITIONAL_COMPILE_FLAGS} ${ADDITIONAL_DEFINES} ${ALLOCATOR_FLAGS} ${ANS_FLAGS} ${BILLING_FLAGS}") 
endif() @@ -463,8 +462,8 @@ add_custom_target( ) file(GLOB JS_FILES "src/js/*") file(COPY ${JS_FILES} DESTINATION ${CMAKE_BINARY_DIR}) -if(SSE_VECTORIZATION) add_dependencies(lepton version) +if(SSE_VECTORIZATION) add_dependencies(lepton-avx version) add_dependencies(lepton-slow-best-ratio version) endif() @@ -472,5 +471,5 @@ add_dependencies(lepton-scalar version) if(SSE_VECTORIZATION) install (TARGETS lepton lepton-slow-best-ratio lepton-avx lepton-scalar DESTINATION bin) else() -install (TARGETS lepton-scalar DESTINATION bin) +install (TARGETS lepton lepton-scalar DESTINATION bin) endif() diff --git a/src/lepton/idct.cc b/src/lepton/idct.cc index 4d187d3b..041e4d7d 100644 --- a/src/lepton/idct.cc +++ b/src/lepton/idct.cc @@ -1,12 +1,13 @@ /* -*-mode:c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif #ifndef USE_SCALAR +# if __ARM_NEON +#include +# else #include #include #include "../vp8/util/mm_mullo_epi32.hh" +# endif #endif #include "../vp8/util/aligned_block.hh" @@ -31,7 +32,7 @@ enum { }; } -#if ((!defined(__SSE2__)) && !(_M_IX86_FP >= 1)) || defined(USE_SCALAR) +#if ((!__ARM_NEON) && ((!defined(__SSE2__)) && !(_M_IX86_FP >= 1))) || defined(USE_SCALAR) static void idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], bool ignore_dc) { int32_t intermed[64]; @@ -159,11 +160,206 @@ idct_scalar(const AlignedBlock &block, const uint16_t q[64], int16_t outp[64], b //outp[i]>>=3; } } +#elif __ARM_NEON + +template +int32x4_t vget_raster(const AlignedBlock &block) { + int32_t a[] = { + block.coefficients_raster(which_vec + 0 * stride + offset), + block.coefficients_raster(which_vec + 1 * stride + offset), + block.coefficients_raster(which_vec + 2 * stride + offset), + block.coefficients_raster(which_vec + 3 * stride + offset), + }; + return vld1q_s32(a); +} +template +int32x4_t vquantize(int which_vec, int32x4_t vec, const uint16_t q[64]) { + int32_t a[] = { + q[which_vec + 0 * stride + offset], + q[which_vec + 1 * stride + offset], + q[which_vec + 2 * stride + offset], + q[which_vec + 3 * stride + offset], + }; + return vmulq_s32(vec, vld1q_s32(a)); +} + +#define TRANSPOSE_128i(row0, row1, row2, row3, ocol0, ocol1, ocol2, ocol3) \ + do { \ + int64x2_t intermed0 = vreinterpretq_s64_s32(vzip1q_s32(row0, row1)); \ + int64x2_t intermed1 = vreinterpretq_s64_s32(vzip1q_s32(row2, row3)); \ + int64x2_t intermed2 = vreinterpretq_s64_s32(vzip2q_s32(row0, row1)); \ + int64x2_t intermed3 = vreinterpretq_s64_s32(vzip2q_s32(row2, row3)); \ + ocol0 = vreinterpretq_s32_s64(vzip1q_s64(intermed0, intermed1)); \ + ocol1 = vreinterpretq_s32_s64(vzip2q_s64(intermed0, intermed1)); \ + ocol2 = vreinterpretq_s32_s64(vzip1q_s64(intermed2, intermed3)); \ + ocol3 = vreinterpretq_s32_s64(vzip2q_s64(intermed2, intermed3)); \ + }while(0) + + +void idct_neon(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { + char vintermed_storage[64 * sizeof(int32_t) + 16]; + // align intermediate storage to 16 bytes + int32_t *vintermed = (int32_t*) (vintermed_storage + 16 - ((vintermed_storage - (char*)nullptr) &0xf)); + using namespace idct_local; + // Horizontal 1-D IDCT. 
+ for (int yvec = 0; yvec < 64; yvec += 32) { + int32x4_t tmp, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, xv8; + if (yvec == 0) { + xv0 = vget_raster<0, 0, 8>(block); + xv1 = vget_raster<0, 4, 8>(block); + xv2 = vget_raster<0, 6, 8>(block); + xv3 = vget_raster<0, 2, 8>(block); + xv4 = vget_raster<0, 1, 8>(block); + xv5 = vget_raster<0, 7, 8>(block); + xv6 = vget_raster<0, 5, 8>(block); + xv7 = vget_raster<0, 3, 8>(block); + if (__builtin_expect(ignore_dc, true)) { + xv0 = vsetq_lane_s32(0, xv0, 0); + } + } else { + xv0 = vget_raster<32, 0, 8>(block); + xv1 = vget_raster<32, 4, 8>(block); + xv2 = vget_raster<32, 6, 8>(block); + xv3 = vget_raster<32, 2, 8>(block); + xv4 = vget_raster<32, 1, 8>(block); + xv5 = vget_raster<32, 7, 8>(block); + xv6 = vget_raster<32, 5, 8>(block); + xv7 = vget_raster<32, 3, 8>(block); + } + + tmp = vquantize<0, 8>(yvec, xv0, q); + xv0 = vaddq_s32(vshlq_n_s32(tmp, 11), vmovq_n_s32(128)); + + tmp = vquantize<4, 8>(yvec, xv1, q); + xv1 = vshlq_n_s32(tmp, 11); + + xv2 = vquantize<6, 8>(yvec, xv2, q); + xv3 = vquantize<2, 8>(yvec, xv3, q); + xv4 = vquantize<1, 8>(yvec, xv4, q); + xv5 = vquantize<7, 8>(yvec, xv5, q); + xv6 = vquantize<5, 8>(yvec, xv6, q); + xv7 = vquantize<3, 8>(yvec, xv7, q); + + // Stage 1. + xv8 = vmulq_s32(vmovq_n_s32(w7), vaddq_s32(xv4, xv5)); + xv4 = vaddq_s32(xv8, vmulq_s32(vmovq_n_s32(w1mw7), xv4)); + xv5 = vsubq_s32(xv8, vmulq_s32(vmovq_n_s32(w1pw7), xv5)); + + xv8 = vmulq_s32(vmovq_n_s32(w3), vaddq_s32(xv6, xv7)); + xv6 = vsubq_s32(xv8, vmulq_s32(vmovq_n_s32(w3mw5), xv6)); + xv7 = vsubq_s32(xv8, vmulq_s32(vmovq_n_s32(w3pw5), xv7)); + + xv8 = vaddq_s32(xv0, xv1); + xv0 = vsubq_s32(xv0, xv1); + xv1 = vmulq_s32(vmovq_n_s32(w6), vaddq_s32(xv3, xv2)); + xv2 = vsubq_s32(xv1, vmulq_s32(vmovq_n_s32(w2pw6), xv2)); + xv3 = vaddq_s32(xv1, vmulq_s32(vmovq_n_s32(w2mw6), xv3)); + xv1 = vaddq_s32(xv4, xv6); + xv4 = vsubq_s32(xv4, xv6); + xv6 = vaddq_s32(xv5, xv7); + xv5 = vsubq_s32(xv5, xv7); + + // Stage 3. + xv7 = vaddq_s32(xv8, xv3); + xv8 = vsubq_s32(xv8, xv3); + xv3 = vaddq_s32(xv0, xv2); + xv0 = vsubq_s32(xv0, xv2); + xv2 = vshrq_n_s32(vaddq_s32(vmulq_s32(vmovq_n_s32(r2), + vaddq_s32(xv4, xv5)), + vmovq_n_s32(128)), 8); + xv4 = vshrq_n_s32(vaddq_s32(vmulq_s32(vmovq_n_s32(r2), + vsubq_s32(xv4, xv5)), + vmovq_n_s32(128)), 8); + // Stage 4. + int index = 0; + for (int32x4_t row0 = vshrq_n_s32(vaddq_s32(xv7, xv1), 8), + row1 = vshrq_n_s32(vaddq_s32(xv3, xv2), 8), + row2 = vshrq_n_s32(vaddq_s32(xv0, xv4), 8), + row3 = vshrq_n_s32(vaddq_s32(xv8, xv6), 8); + true; // will break if index == 4 at the end of this loop + index += 4, + row0 = vshrq_n_s32(vsubq_s32(xv8, xv6), 8), + row1 = vshrq_n_s32(vsubq_s32(xv0, xv4), 8), + row2 = vshrq_n_s32(vsubq_s32(xv3, xv2), 8), + row3 = vshrq_n_s32(vsubq_s32(xv7, xv1), 8)) { + int32x4_t col0, col1, col2, col3; + TRANSPOSE_128i(row0, row1, row2, row3, col0, col1, col2, col3); + + vst1q_s32(vintermed + index + 0 + yvec, col0); + vst1q_s32(vintermed + index + 8 + yvec, col1); + vst1q_s32(vintermed + index + 16 + yvec, col2); + vst1q_s32(vintermed + index + 24 + yvec, col3); + if (index == 4) { + break; // only iterate twice + } + } + } + // Vertical 1-D IDCT. 
+ for (uint8_t xvec = 0; xvec < 8; xvec += 4) { + int32x4_t yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, yv8; + yv0 = vaddq_s32(vshlq_n_s32(vld1q_s32(vintermed + xvec), 8), + vmovq_n_s32(8192)); + yv1 = vshlq_n_s32(vld1q_s32(vintermed + 8 * 4 + xvec), 8); + yv2 = vld1q_s32(vintermed + 8 * 6 + xvec); + yv3 = vld1q_s32(vintermed + 8 * 2 + xvec); + yv4 = vld1q_s32(vintermed + 8 * 1 + xvec); + yv5 = vld1q_s32(vintermed + 8 * 7 + xvec); + yv6 = vld1q_s32(vintermed + 8 * 5 + xvec); + yv7 = vld1q_s32(vintermed + 8 * 3 + xvec); + + // Stage 1. + yv8 = vaddq_s32(vmulq_s32(vaddq_s32(yv4, yv5), vmovq_n_s32(w7)), vmovq_n_s32(4)); + yv4 = vshrq_n_s32(vaddq_s32(yv8, vmulq_s32(vmovq_n_s32(w1mw7), yv4)), 3); + yv5 = vshrq_n_s32(vsubq_s32(yv8, vmulq_s32(vmovq_n_s32(w1pw7), yv5)), 3); + yv8 = vaddq_s32(vmulq_s32(vmovq_n_s32(w3), vaddq_s32(yv6, yv7)), vmovq_n_s32(4)); + yv6 = vshrq_n_s32(vsubq_s32(yv8, vmulq_s32(vmovq_n_s32(w3mw5), yv6)), 3); + yv7 = vshrq_n_s32(vsubq_s32(yv8, vmulq_s32(vmovq_n_s32(w3pw5), yv7)), 3); + // Stage 2. + yv8 = vaddq_s32(yv0, yv1); + yv0 = vsubq_s32(yv0, yv1); + yv1 = vaddq_s32(vmulq_s32(vmovq_n_s32(w6), vaddq_s32(yv3, yv2)), vmovq_n_s32(4)); + yv2 = vshrq_n_s32(vsubq_s32(yv1, vmulq_s32(vmovq_n_s32(w2pw6), yv2)), 3); + yv3 = vshrq_n_s32(vaddq_s32(yv1, vmulq_s32(vmovq_n_s32(w2mw6), yv3)), 3); + yv1 = vaddq_s32(yv4, yv6); + yv4 = vsubq_s32(yv4, yv6); + yv6 = vaddq_s32(yv5, yv7); + yv5 = vsubq_s32(yv5, yv7); + + // Stage 3. + yv7 = vaddq_s32(yv8, yv3); + yv8 = vsubq_s32(yv8, yv3); + yv3 = vaddq_s32(yv0, yv2); + yv0 = vsubq_s32(yv0, yv2); + yv2 = vshrq_n_s32(vaddq_s32(vmulq_s32(vmovq_n_s32(r2), + vaddq_s32(yv4, yv5)), + vmovq_n_s32(128)), 8); + yv4 = vshrq_n_s32(vaddq_s32(vmulq_s32(vmovq_n_s32(r2), + vsubq_s32(yv4, yv5)), + vmovq_n_s32(128)), 8); + int32x4_t row0 = vshrq_n_s32(vaddq_s32(yv7, yv1), 11); + int32x4_t row1 = vshrq_n_s32(vaddq_s32(yv3, yv2), 11); + int32x4_t row2 = vshrq_n_s32(vaddq_s32(yv0, yv4), 11); + int32x4_t row3 = vshrq_n_s32(vaddq_s32(yv8, yv6), 11); + int32x4_t row4 = vshrq_n_s32(vsubq_s32(yv8, yv6), 11); + int32x4_t row5 = vshrq_n_s32(vsubq_s32(yv0, yv4), 11); + int32x4_t row6 = vshrq_n_s32(vsubq_s32(yv3, yv2), 11); + int32x4_t row7 = vshrq_n_s32(vsubq_s32(yv7, yv1), 11); + + vst1_s16(voutp + 0 * 8 + xvec, vmovn_s32(row0)); + vst1_s16(voutp + 1 * 8 + xvec, vmovn_s32(row1)); + vst1_s16(voutp + 2 * 8 + xvec, vmovn_s32(row2)); + vst1_s16(voutp + 3 * 8 + xvec, vmovn_s32(row3)); + vst1_s16(voutp + 4 * 8 + xvec, vmovn_s32(row4)); + vst1_s16(voutp + 5 * 8 + xvec, vmovn_s32(row5)); + vst1_s16(voutp + 6 * 8 + xvec, vmovn_s32(row6)); + vst1_s16(voutp + 7 * 8 + xvec, vmovn_s32(row7)); + }} + #else /* At least SSE2 is available { */ template __m128i vget_raster(const AlignedBlock&block) { - return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset), block.coefficients_raster(which_vec + 2 * stride + offset), + return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset), block.coefficients_raster(which_vec + 1 * stride + offset), block.coefficients_raster(which_vec + offset)); } @@ -612,15 +808,13 @@ void idct(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64], bool ignore_dc) { #ifdef USE_SCALAR idct_scalar(block, q, voutp, ignore_dc); -#else -#ifdef __AVX2__ +#elif __ARM_NEON + idct_neon(block, q, voutp, ignore_dc); +#elif defined(__AVX2__) idct_avx(block, q, voutp, ignore_dc); -#else -#if defined(__SSE2__) || (_M_IX86_FP >= 1) +#elif defined(__SSE2__) || (_M_IX86_FP >= 1) idct_sse(block, q, voutp, ignore_dc); #else 
idct_scalar(block, q, voutp, ignore_dc); #endif -#endif -#endif } diff --git a/src/lepton/jpgcoder.cc b/src/lepton/jpgcoder.cc index cce05863..5623bd8f 100644 --- a/src/lepton/jpgcoder.cc +++ b/src/lepton/jpgcoder.cc @@ -58,13 +58,13 @@ volatile int volatile1024 = 1024; #endif -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR +# if __ARM_NEON +#include +# else #include #include +# endif #endif #include "jpgcoder.hh" @@ -2476,6 +2476,11 @@ enum MergeJpegStreamingStatus{ bool aligned_memchr16ff(const unsigned char *local_huff_data) { #if USE_SCALAR return memchr(local_huff_data, 0xff, 16) != NULL; +#elif __ARM_NEON + uint8x16_t buf = vld1q_u8(local_huff_data), + res = vceqq_u8(buf, vmovq_n_u8(~0)); + uint16_t val = vaddlvq_u8(res); + return val; #else __m128i buf = _mm_load_si128((__m128i const*)local_huff_data); __m128i ff = _mm_set1_epi8(-1); diff --git a/src/lepton/recoder.cc b/src/lepton/recoder.cc index 27726fd0..32b95a6a 100644 --- a/src/lepton/recoder.cc +++ b/src/lepton/recoder.cc @@ -60,7 +60,23 @@ int find_aligned_end_64_scalar(const int16_t *block) { return end; } -#if defined(__AVX2__) && !defined(USE_SCALAR) +#if __ARM_NEON && !defined(USE_SCALAR) +int find_aligned_end_64_neon(const int16_t *p) { + int l = 0; + int16_t va[] = {0, 1, 2, 3, 4, 5, 6, 7}; + int16x8_t vn = vld1q_s16(va); + + for (int i = 0; i < 8; ++i, vn = vaddq_s16(vn, vmovq_n_s16(8))) { + int16x8_t buf = vld1q_s16(p + i * 8); + int16x8_t zro = vreinterpretq_s16_u16(vtstq_s16(buf, buf)); + int16_t val = vmaxvq_s16(vandq_s16(vn, zro)); + if (val) { + l = val; + } + } + return l; +} +#elif defined(__AVX2__) && !defined(USE_SCALAR) int find_aligned_end_64_avx2(const int16_t *block) { uint32_t mask = 0; int iter; @@ -110,8 +126,10 @@ int find_aligned_end_64_sse42(const int16_t *block) { #endif int find_aligned_end_64(const int16_t *block) { -#if defined(USE_SCALAR) +#ifdef USE_SCALAR return find_aligned_end_64_scalar(block); +#elif __ARM_NEON + return find_aligned_end_64_neon(block); #elif defined(__AVX2__) return find_aligned_end_64_avx2(block); #elif defined(__SSE_4_2) @@ -124,6 +142,8 @@ int find_aligned_end_64(const int16_t *block) { static bool aligned_memchr16ff(const unsigned char *local_huff_data) { #ifdef USE_SCALAR return memchr(local_huff_data, 0xff, 16) != NULL; +#elif __ARM_NEON + return !!vaddlvq_u8(vceqq_u8(vld1q_u8(local_huff_data), vmovq_n_u8(~0))); #else __m128i buf = _mm_load_si128((__m128i const*)local_huff_data); __m128i ff = _mm_set1_epi8(-1); diff --git a/src/lepton/vp8_encoder.cc b/src/lepton/vp8_encoder.cc index cbac1863..de3f9bef 100644 --- a/src/lepton/vp8_encoder.cc +++ b/src/lepton/vp8_encoder.cc @@ -162,7 +162,27 @@ uint32_t aligned_block_cost_scalar(const AlignedBlock &block) { } uint32_t aligned_block_cost(const AlignedBlock &block) { -#if defined(__SSE2__) && !defined(USE_SCALAR) /* SSE2 or higher instruction set available { */ +#if __ARM_NEON && !defined(USE_SCALAR) + int16x8_t zero = vmovq_n_s16(0); + int16x8_t cost; + for (int i = 0; i < 64; i += 8) { + int16x8_t vrow; + + cost = vmovq_n_s16(0); + vrow = vabsq_s16(vld1q_s16(block.raw_data() + i)); + while (vmaxvq_s16(vrow)) { + int16x8_t mask = vreinterpretq_s16_u16(vcgtq_s16(vrow, zero)); + cost = vaddq_s16(cost, vandq_s16(mask, vmovq_n_s16(2))); + vrow = vshrq_n_s16(vrow, 1); + } + cost = vaddq_s16(cost, vshrq_n_s16(cost, 8)); + cost = vaddq_s16(cost, vshrq_n_s16(cost, 4)); + cost = vaddq_s16(cost, vshrq_n_s16(cost, 2)); + } + uint16_t rslt = 16 + vgetq_lane_s16(cost, 0); + //dprintf(3, "cost=%d, should=%d\n", 
rslt, aligned_block_cost_scalar(block)); + return rslt; +#elif defined(__SSE2__) && !defined(USE_SCALAR) /* SSE2 or higher instruction set available { */ const __m128i zero = _mm_setzero_si128(); __m128i v_cost; for (int i = 0; i < 64; i+= 8) { diff --git a/src/vp8/model/model.cc b/src/vp8/model/model.cc index 2ddb8d10..e5fb191c 100644 --- a/src/vp8/model/model.cc +++ b/src/vp8/model/model.cc @@ -8,15 +8,16 @@ #include #include -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR -#include +# if __ARM_NEON +# include +# else +# include +# endif #endif #include "model.hh" + bool all_branches_identity(const Branch * start, const Branch * end) { for (const Branch * i = start;i != end; ++i) { if (!i->is_identity()){ @@ -99,6 +100,40 @@ void set_branch_range_identity(Branch * start, Branch * end) { _mm_store_si128(write_cursor + 2, r2); write_cursor += 3; } +#elif __ARM_NEON && !defined(USE_SCALAR) + for (int i = 0;i < 32; ++i) { + start[i].set_identity(); + } + for (int i = 1; i <= 32; ++i) { + end[-i].set_identity(); + } + char * data = (char *)(void*)start; + uint64x1x4_t r0 = vld4_u64((const uint64_t*)data); + uint64x1x4_t r1 = vld4_u64((const uint64_t*)(data + 32)); + uint64x1x4_t r2 = vld4_u64((const uint64_t*)(data + 64)); + size_t offset = data - (char*)0; + size_t align = 32 - (offset % 32); + char * dataend = (char*)end; + size_t offsetend = dataend - (char*)0; + uint64_t *write_end = (uint64_t *) (dataend - (offsetend % 32)); + uint64_t *write_cursor = (uint64_t *) (data + align); + switch(align % 3) { + case 2: + vst4_u64(write_cursor, r1); + write_cursor += 4; + case 1: + vst4_u64(write_cursor, r2); + write_cursor += 4; + case 0: + break; + } + while(write_cursor + 2 < write_end) { + vst4_u64(write_cursor + 0, r0); + vst4_u64(write_cursor + 1, r1); + vst4_u64(write_cursor + 2, r2); + write_cursor += 3; + } + #else for (;start != end; ++start) { start->set_identity(); diff --git a/src/vp8/model/model.hh b/src/vp8/model/model.hh index c6481701..67433afc 100644 --- a/src/vp8/model/model.hh +++ b/src/vp8/model/model.hh @@ -13,13 +13,13 @@ #include "../util/aligned_block.hh" #include "../util/block_based_image.hh" -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR -#include -#include "../util/mm_mullo_epi32.hh" +# if __ARM_NEON +# include +# else +# include +# include "../util/mm_mullo_epi32.hh" +# endif #endif enum F_TYPE { @@ -670,7 +670,17 @@ public: if (retval > max_value) retval -= adjustment_factor; return retval; } +#if __ARM_NEON +#define shift_right_round_zero_epi16(vec, imm8) __extension__ ({ \ + int16x8_t sign = vreinterpretq_s16_u16(vcltzq_s16(vec)); \ + int16x8_t rslt = vshrq_n_s16(vabsq_s16(vec), imm8); \ + /* ((x^0xffff) - 0xffff == not(x)+1 */ \ + rslt = veorq_s16(rslt, sign); \ + vsubq_s16(rslt, sign); \ +}) +#else #define shift_right_round_zero_epi16(vec, imm8) (_mm_sign_epi16(_mm_srli_epi16(_mm_sign_epi16(vec, vec), imm8), vec)); +#endif int adv_predict_dc_pix(const ConstBlockContext&context, int16_t*pixels_sans_dc, int32_t *uncertainty_val, int32_t *uncertainty2_val) { uint16_t *q = ProbabilityTablesBase::quantization_table((int)color); idct(context.here(), q, pixels_sans_dc, true); @@ -680,6 +690,54 @@ public: int32_t avgmed = 0; if(all_present || left_present || above_present) { #ifndef USE_SCALAR +# if __ARM_NEON + if (all_present || above_present) { //above goes first to prime the cache + int16x8_t neighbor_above = vld1q_s16(context.neighbor_context_above_unchecked() + .horizontal_ptr()); + int16x8_t pixels_sans_dc_reg = 
vld1q_s16(pixels_sans_dc); + int16x8_t pixels2_sans_dc_reg = vld1q_s16(pixels_sans_dc + 8); + int16x8_t pixels_delta = vsubq_s16(pixels_sans_dc_reg, pixels2_sans_dc_reg); + int16x8_t pixels_delta_div2 = shift_right_round_zero_epi16(pixels_delta, 1); + int16x8_t pixels_sans_dc_recentered = vaddq_s16(pixels_sans_dc_reg, + vmovq_n_s16(1024)); + int16x8_t above_dc_estimate = vsubq_s16(vsubq_s16(neighbor_above, pixels_delta_div2), + pixels_sans_dc_recentered); + + vst1q_s16(dc_estimates.begin() + ((all_present || left_present) ? 8 : 0), above_dc_estimate); + } + if (all_present || left_present) { + const int16_t * horiz_data = context.neighbor_context_left_unchecked().vertical_ptr_except_7(); + int16x8_t neighbor_horiz = vld1q_s16(horiz_data); + int16_t pixels_sans_dc_1[] = { + pixels_sans_dc[0], + pixels_sans_dc[8], + pixels_sans_dc[16], + pixels_sans_dc[32], + pixels_sans_dc[40], + pixels_sans_dc[24], + pixels_sans_dc[48], + pixels_sans_dc[56], + }, pixels_sans_dc_2[] = { + pixels_sans_dc[1], + pixels_sans_dc[9], + pixels_sans_dc[17], + pixels_sans_dc[33], + pixels_sans_dc[41], + pixels_sans_dc[25], + pixels_sans_dc[49], + pixels_sans_dc[57], + }; + int16x8_t pixels_sans_dc_reg = vld1q_s16(pixels_sans_dc_1), + pixels_delta = vsubq_s16(pixels_sans_dc_reg, vld1q_s16(pixels_sans_dc_2)); + + int16x8_t pixels_delta_div2 = shift_right_round_zero_epi16(pixels_delta, 1); + int16x8_t left_dc_estimate = vsubq_s16(vsubq_s16(neighbor_horiz, pixels_delta_div2), + vaddq_s16(pixels_sans_dc_reg, + vmovq_n_s16(1024))); + + vst1q_s16(dc_estimates.begin(), left_dc_estimate); + } +# else if (all_present || above_present) { //above goes first to prime the cache __m128i neighbor_above = _mm_loadu_si128((const __m128i*)(const char*)context .neighbor_context_above_unchecked() @@ -727,6 +785,7 @@ public: _mm_store_si128((__m128i*)(char*)dc_estimates.begin(), left_dc_estimate); } +# endif #else if (all_present || left_present) { for (int i = 0; i < 8;++i) { @@ -870,6 +929,48 @@ public: //} } #if defined(OPTIMIZED_7x7) && !defined(USE_SCALAR) +# if __ARM_NEON + bool aavrg_vec_matches(int16x8_t &retval, unsigned int aligned_zz, ConstBlockContext context) { + int16_t ret[8], correct[8]; + vst1q_s16(ret, retval); + for (int i = 0; i < 8; ++i) { + if (ret[i] != compute_aavrg(aligned_to_raster.at(aligned_zz + i), aligned_zz + i, context)) { + return false; + } + } + return true; + } + void compute_aavrg_vec(unsigned int aligned_zz, ConstBlockContext context, short* aligned_retval) { + vst1q_s16(aligned_retval, compute_aavrg_vec(aligned_zz, context)); + } + int16x8_t compute_aavrg_vec(unsigned int aligned_zz, ConstBlockContext context) { + if (all_present == false && left_present == false && above_present == false) { + return vmovq_n_s16(0); + } + int16x8_t left = vmovq_n_s16(0); + if (all_present || left_present) { + left = vabsq_s16(vld1q_s16(&context.left_unchecked().coef.at(aligned_zz))); + if ((!all_present) && !above_present) { + return left; + } + } + int16x8_t above = vmovq_n_s16(0); + if (all_present || above_present) { + above = vabsq_s16(vld1q_s16(&context.above_unchecked().coef.at(aligned_zz))); + if (all_present == false && !left_present) { + return above; + } + } + constexpr unsigned int log_weight = 5; + int16x8_t total = vaddq_s16(left, above); + total = vmulq_n_s16(total, 13); // approximate (a*2+b*2 + c)/5 as (a *13 + b * 13 + c * 6)/32 + int16x8_t aboveleft = vabsq_s16(vld1q_s16(&context.above_left_unchecked().coef.at(aligned_zz))); + total = vaddq_s16(total, vmulq_n_s16(aboveleft, 6)); + int16x8_t retval 
= vshrq_n_s16(total, log_weight); + dev_assert(aavrg_vec_matches(retval, aligned_zz, context)); + return retval; + } +# else bool aavrg_vec_matches(__m128i retval, unsigned int aligned_zz, ConstBlockContext context) { short ret[8]; _mm_storeu_si128((__m128i*)(char*)ret, retval); @@ -922,9 +1023,116 @@ public: //total += abs(block.context().above_right.get()->coefficients().at(0)); //} } +# endif // !__ARM_NEON #endif #ifndef USE_SCALAR +# if __ARM_NEON + static inline int32x4_t vsignq_s32(int32x4_t v, int32x4_t m) { + // v[] * ((m[]<0 ? ~0:0) | ((m[]>0 ? ~0:0) & 1)) + return vmulq_s32(v, vreinterpretq_s32_u32(vorrq_u32(vcltzq_s32(m), vandq_u32(vcgtzq_s32(m), vmovq_n_u32(1))))); + } + + static int32_t compute_lak_vec(int32x4_t coeffs_x_low, int32x4_t coeffs_x_high, int32x4_t coeffs_a_low, int32x4_t indirect_coeffs_a_high, const int32_t *icos_deq) { + int32_t sign_mask_a[] = {1, -1, 1, -1}; // ((i & 1) ? -1 : 1) + int32x4_t sign_mask = vld1q_s32(sign_mask_a); + + //coeffs_x[i] = ((i & 1) ? -1 : 1) * coeffs_a[i] - coeffs_x[i]; + coeffs_a_low = vsignq_s32(coeffs_a_low, sign_mask); + int32x4_t coeffs_a_high = vsignq_s32(indirect_coeffs_a_high, sign_mask); + coeffs_x_low = vsubq_s32(coeffs_a_low, coeffs_x_low); + coeffs_x_high = vsubq_s32(coeffs_a_high, coeffs_x_high); + + int32x4_t icos_low = vld1q_s32(icos_deq); + int32x4_t icos_high = vld1q_s32(icos_deq + 4); + // coeffs_x[i] *= icos[i] + int32x4_t deq_low = vmulq_s32(coeffs_x_low, icos_low); + int32x4_t deq_high = vmulq_s32(coeffs_x_high, icos_high); + + int32_t prediction = vaddvq_s32(vaddq_s32(deq_low, deq_high)); + return prediction / icos_deq[0]; + } + +#define ITER(x_var, a_var, i, step) do { \ + int32_t xa[] = { \ + i == 0 ? 0 : context.here().coefficients_raster(band + step * (i)), \ + context.here().coefficients_raster(band + step * ((i) + 1)), \ + context.here().coefficients_raster(band + step * ((i) + 2)), \ + context.here().coefficients_raster(band + step * ((i) + 3)), \ + }; \ + x_var = vld1q_s32(xa); \ + int32_t aa[] = { \ + neighbor.coefficients_raster(band + step * (i)), \ + neighbor.coefficients_raster(band + step * ((i) + 1)), \ + neighbor.coefficients_raster(band + step * ((i) + 2)), \ + neighbor.coefficients_raster(band + step * ((i) + 3)), \ + }; \ + a_var = vld1q_s32(aa); \ + } while(0) + + template +#ifndef _WIN32 + __attribute__((always_inline)) +#endif + int32_t compute_lak_templ(const ConstBlockContext&context) { + int32x4_t coeffs_x_low; + int32x4_t coeffs_x_high; + int32x4_t coeffs_a_low; + int32x4_t coeffs_a_high; + const int32_t * icos = nullptr; + static_assert((band & 7) == 0 || (band >> 3) == 0, "This function only works on edges"); + if ((band >> 3) == 0) { + if(all_present == false && !above_present) { + return 0; + } + const auto &neighbor = context.above_unchecked(); + ITER(coeffs_x_low, coeffs_a_low, 0, 8); + ITER(coeffs_x_high, coeffs_a_high, 4, 8); + icos = ProbabilityTablesBase::icos_idct_edge_8192_dequantized_x((int)COLOR) + band * 8; + } else { + if (all_present == false && !left_present) { + return 0; + } + const auto &neighbor = context.left_unchecked(); + ITER(coeffs_x_low, coeffs_a_low, 0, 1); + ITER(coeffs_x_high, coeffs_a_high, 4, 1); + icos = ProbabilityTablesBase::icos_idct_edge_8192_dequantized_y((int)COLOR) + band; + } + return compute_lak_vec(coeffs_x_low, coeffs_x_high, coeffs_a_low, coeffs_a_high, icos); + } + int32_t compute_lak_horizontal(const ConstBlockContext&context, unsigned int band) { + if (all_present == false && !above_present) { + return 0; + } + int32x4_t coeffs_x_low; + 
int32x4_t coeffs_x_high; + int32x4_t coeffs_a_low; + int32x4_t coeffs_a_high; + dev_assert(band/8 == 0 && "this function only works for the top edge"); + const auto &neighbor = context.above_unchecked(); + ITER(coeffs_x_low, coeffs_a_low, 0, 8); + ITER(coeffs_x_high, coeffs_a_high, 4, 8); + const int32_t * icos = ProbabilityTablesBase::icos_idct_edge_8192_dequantized_x((int)COLOR) + band * 8; + return compute_lak_vec(coeffs_x_low, coeffs_x_high, coeffs_a_low, coeffs_a_high, icos); + } + int32_t compute_lak_vertical(const ConstBlockContext&context, unsigned int band) { + dev_assert((band & 7) == 0 && "Must be used for veritcal"); + if (all_present == false && !left_present) { + return 0; + } + int32x4_t coeffs_x_low; + int32x4_t coeffs_x_high; + int32x4_t coeffs_a_low; + int32x4_t coeffs_a_high; + const auto &neighbor = context.left_unchecked(); + ITER(coeffs_x_low, coeffs_a_low, 0, 1); + ITER(coeffs_x_high, coeffs_a_high, 4, 1); +#undef ITER + const int32_t *icos = ProbabilityTablesBase::icos_idct_edge_8192_dequantized_y((int)COLOR) + band; + return compute_lak_vec(coeffs_x_low, coeffs_x_high, coeffs_a_low, coeffs_a_high, + icos); + } +# else static int32_t compute_lak_vec(__m128i coeffs_x_low, __m128i coeffs_x_high, __m128i coeffs_a_low, __m128i #ifdef _WIN32 & @@ -1029,6 +1237,7 @@ public: return compute_lak_vec(coeffs_x_low, coeffs_x_high, coeffs_a_low, coeffs_a_high, icos); } +# endif // !__ARM_NEON #endif int32_t compute_lak(const ConstBlockContext&context, unsigned int band) { int coeffs_x[8]; diff --git a/src/vp8/model/numeric.hh b/src/vp8/model/numeric.hh index fe01ac1d..b6e5d9bd 100644 --- a/src/vp8/model/numeric.hh +++ b/src/vp8/model/numeric.hh @@ -10,14 +10,14 @@ #include #include "../util/memory.hh" -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR +# if __ARM_NEON +#include +# else #include #include #include "../util/mm_mullo_epi32.hh" +# endif #endif #ifdef __GNUC__ @@ -321,7 +321,7 @@ template constexpr uint32_t templ_divide16bit(uint32_t num) { >> DivisorAndLog2Table[denom].len; } -#ifndef USE_SCALAR +#if !defined(USE_SCALAR) && !defined(__ARM_NEON) template __m128i divide16bit_vec_signed(__m128i num) { static_assert(denom < 1024, "Only works for denominators < 1024"); __m128i m = _mm_set1_epi32(DivisorAndLog2Table[denom].divisor); diff --git a/src/vp8/util/block_context.hh b/src/vp8/util/block_context.hh index 78a13d2a..fd3abeef 100644 --- a/src/vp8/util/block_context.hh +++ b/src/vp8/util/block_context.hh @@ -2,12 +2,12 @@ #define BLOCK_CONTEXT_HH_ #include "options.hh" -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR +# ifdef __ARM_NEON +#include +# else #include "tmmintrin.h" +# endif #endif enum { @@ -39,8 +39,17 @@ struct NeighborSummary { return &edge_pixels[8]; } +#if __ARM_NEON +#define shift_right_round_zero_epi16(vec, imm8) __extension__ ({ \ + int16x8_t sign = vreinterpretq_s16_u16(vcltzq_s16(vec)); \ + int16x8_t rslt = vshrq_n_s16(vabsq_s16(vec), imm8); \ + /* ((x^0xffff) - 0xffff == not(x)+1 */ \ + rslt = veorq_s16(rslt, sign); \ + vsubq_s16(rslt, sign); \ +}) +#else #define shift_right_round_zero_epi16(vec, imm8) (_mm_sign_epi16(_mm_srli_epi16(_mm_sign_epi16(vec, vec), imm8), vec)); - +#endif void set_horizontal(int16_t * data_aligned, uint16_t* quantization_table, int16_t dc) { #ifdef USE_SCALAR for (int i = 0; i < 8 ; ++i) { @@ -48,6 +57,14 @@ struct NeighborSummary { //if (i == 7) delta = 0; edge_pixels[i + 8] = dc * quantization_table[0] + data_aligned[i + 56] + 128 * xIDCTSCALE + (delta/2); } +#elif __ARM_NEON 
+ int16x8_t cur_row = vld1q_s16(data_aligned + 56), + prev_row = vld1q_s16(data_aligned + 48), + delta = vsubq_s16(cur_row, prev_row), + half_delta = shift_right_round_zero_epi16(delta, 1), + pred_row = vaddq_s16(vaddq_s16(cur_row, half_delta), vmovq_n_s16(128 * xIDCTSCALE)); + pred_row = vaddq_s16(pred_row, vmovq_n_s16(quantization_table[0] * dc)); + vst1q_s16(&edge_pixels[8], pred_row); #else __m128i cur_row = _mm_load_si128((const __m128i*)(data_aligned + 56)); __m128i prev_row = _mm_load_si128((const __m128i*)(data_aligned + 48)); @@ -64,8 +81,35 @@ struct NeighborSummary { for (int i = 0; i < 8 ; ++i) { int delta = data[i * 8 + 7] - data[i * 8 + 6]; //if (i == 7) delta = 0; - edge_pixels[i] = dc * quantization_table[0] + data[i * 8 + 7] + 128 * xIDCTSCALE + (delta/2); + edge_pixels[i] = dc * quantization_table[0] + data[i * 8 + 7] + 128 * xIDCTSCALE + (delta/2); } +#elif __ARM_NEON + int16_t cur_row_a[] = { + data[7], + data[15], + data[23], + data[31], + data[39], + data[47], + data[55], + data[63], + }, prev_row_a[] = { + data[6], + data[14], + data[22], + data[30], + data[38], + data[46], + data[54], + data[62], + }; + int16x8_t cur_row = vld1q_s16(cur_row_a), + prev_row = vld1q_s16(prev_row_a), + delta = vsubq_s16(cur_row,prev_row), + half_delta = shift_right_round_zero_epi16(delta, 1), + pred_row = vaddq_s16(vaddq_s16(cur_row, half_delta), vmovq_n_s16(128 * xIDCTSCALE)); + pred_row = vaddq_s16(pred_row, vmovq_n_s16(quantization_table[0] * dc)); + vst1q_s16(&edge_pixels[0], pred_row); #else __m128i cur_row = _mm_set_epi16(data[63], data[55], data[47], data[39], data[31], data[23], data[15], data[7]); __m128i prev_row = _mm_set_epi16(data[62], data[54], data[46], data[38], data[30], data[22], data[14], data[6]); diff --git a/src/vp8/util/generic_worker.cc b/src/vp8/util/generic_worker.cc index d124045b..cd934afd 100644 --- a/src/vp8/util/generic_worker.cc +++ b/src/vp8/util/generic_worker.cc @@ -1,13 +1,17 @@ #include "memory.hh" -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR +# if __ARM_NEON +#include +# else #include +# endif #endif +# if __ARM_ACLE +#include +# endif + #include #ifdef _WIN32 #include @@ -36,6 +40,8 @@ void _cross_platform_pause() { #if !defined(USE_SCALAR) && defined(__i386__) _mm_pause(); +#elif __ARM_ACLE + __yield(); #else #ifdef _WIN32 Sleep(0); diff --git a/src/vp8/util/memory.cc b/src/vp8/util/memory.cc index e9dbc0fb..e0d5b4ee 100644 --- a/src/vp8/util/memory.cc +++ b/src/vp8/util/memory.cc @@ -1,9 +1,9 @@ -#ifdef __aarch64__ -#define USE_SCALAR 1 -#endif - #ifndef USE_SCALAR +# if __ARM_NEON +#include +# else #include +# endif #endif #include "options.hh" @@ -134,6 +134,10 @@ void custom_free(void* ptr) { void * bzero32(void *aligned_32) { #if __AVX2__ _mm256_store_si256((__m256i*)aligned_32, _mm256_setzero_si256()); +#elif __ARM_NEON && !defined(USE_SCALAR) + int32x4_t z = vmovq_n_s32(0); + vst1q_s32((int32_t *) aligned_32, z); + vst1q_s32((int32_t *) aligned_32 + 4, z); #elif !defined(USE_SCALAR) _mm_store_si128((__m128i*)aligned_32, _mm_setzero_si128()); _mm_store_si128(((__m128i*)aligned_32) + 1, _mm_setzero_si128()); diff --git a/test_suite/test_trunc.sh b/test_suite/test_trunc.sh index fc3b3bf1..dbab73c7 100755 --- a/test_suite/test_trunc.sh +++ b/test_suite/test_trunc.sh @@ -1,5 +1,5 @@ #!/bin/sh -export INPUT_TO_TEST=`dirname $0`/../../images/iphone.jpg +export INPUT_TO_TEST=`dirname $0`/../images/iphone.jpg if [ $# -eq 0 ]; then echo "Using default file $INPUT_TO_TEST" else From 2bd3b5a34ea39388812a76cc47f9e022fba6e3b4 
Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Mon, 22 Aug 2022 15:12:20 +0200 Subject: [PATCH 2/7] fix unintentinally moved x86 line --- src/lepton/idct.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lepton/idct.cc b/src/lepton/idct.cc index 041e4d7d..6354ab64 100644 --- a/src/lepton/idct.cc +++ b/src/lepton/idct.cc @@ -358,8 +358,8 @@ void idct_neon(const AlignedBlock &block, const uint16_t q[64], int16_t voutp[64 #else /* At least SSE2 is available { */ template __m128i vget_raster(const AlignedBlock&block) { - block.coefficients_raster(which_vec + 2 * stride + offset), return _mm_set_epi32(block.coefficients_raster(which_vec + 3 * stride + offset), + block.coefficients_raster(which_vec + 2 * stride + offset), block.coefficients_raster(which_vec + 1 * stride + offset), block.coefficients_raster(which_vec + offset)); } From 6835803c9d9c3fd86da352041a34130db2d7f89a Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Tue, 23 Aug 2022 13:20:50 +0200 Subject: [PATCH 3/7] align duplicate impl of aligned_memchr16ff --- src/lepton/jpgcoder.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lepton/jpgcoder.cc b/src/lepton/jpgcoder.cc index 5623bd8f..66af62d4 100644 --- a/src/lepton/jpgcoder.cc +++ b/src/lepton/jpgcoder.cc @@ -2477,10 +2477,7 @@ bool aligned_memchr16ff(const unsigned char *local_huff_data) { #if USE_SCALAR return memchr(local_huff_data, 0xff, 16) != NULL; #elif __ARM_NEON - uint8x16_t buf = vld1q_u8(local_huff_data), - res = vceqq_u8(buf, vmovq_n_u8(~0)); - uint16_t val = vaddlvq_u8(res); - return val; + return !!vaddlvq_u8(vceqq_u8(vld1q_u8(local_huff_data), vmovq_n_u8(~0))); #else __m128i buf = _mm_load_si128((__m128i const*)local_huff_data); __m128i ff = _mm_set1_epi8(-1); From 8f962fa8fb219ad7f079af61c1f89105529c3847 Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Tue, 23 Aug 2022 13:56:17 +0200 Subject: [PATCH 4/7] fix data input mixup in adv_predict_dc_pix() --- src/vp8/model/model.hh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vp8/model/model.hh b/src/vp8/model/model.hh index 67433afc..2931dc9b 100644 --- a/src/vp8/model/model.hh +++ b/src/vp8/model/model.hh @@ -692,8 +692,7 @@ public: #ifndef USE_SCALAR # if __ARM_NEON if (all_present || above_present) { //above goes first to prime the cache - int16x8_t neighbor_above = vld1q_s16(context.neighbor_context_above_unchecked() - .horizontal_ptr()); + int16x8_t neighbor_above = vld1q_s16(context.neighbor_context_above_unchecked().horizontal_ptr()); int16x8_t pixels_sans_dc_reg = vld1q_s16(pixels_sans_dc); int16x8_t pixels2_sans_dc_reg = vld1q_s16(pixels_sans_dc + 8); int16x8_t pixels_delta = vsubq_s16(pixels_sans_dc_reg, pixels2_sans_dc_reg); @@ -712,18 +711,18 @@ public: pixels_sans_dc[0], pixels_sans_dc[8], pixels_sans_dc[16], + pixels_sans_dc[24], pixels_sans_dc[32], pixels_sans_dc[40], - pixels_sans_dc[24], pixels_sans_dc[48], pixels_sans_dc[56], }, pixels_sans_dc_2[] = { pixels_sans_dc[1], pixels_sans_dc[9], pixels_sans_dc[17], + pixels_sans_dc[25], pixels_sans_dc[33], pixels_sans_dc[41], - pixels_sans_dc[25], pixels_sans_dc[49], pixels_sans_dc[57], }; From 6ef7a6cd4ac9bdd4da02aa6791f7d901c49d52ef Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Fri, 7 Oct 2022 21:17:47 +0200 Subject: [PATCH 5/7] use a memory barrier instead of __yield --- src/vp8/util/generic_worker.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/vp8/util/generic_worker.cc b/src/vp8/util/generic_worker.cc index 
cd934afd..f158eb78 100644 --- a/src/vp8/util/generic_worker.cc +++ b/src/vp8/util/generic_worker.cc @@ -8,10 +8,6 @@ # endif #endif -# if __ARM_ACLE -#include -# endif - #include #ifdef _WIN32 #include @@ -40,8 +36,8 @@ void _cross_platform_pause() { #if !defined(USE_SCALAR) && defined(__i386__) _mm_pause(); -#elif __ARM_ACLE - __yield(); +#elif __ARM_NEON + __asm__ __volatile__("dmb ish"); #else #ifdef _WIN32 Sleep(0); From 010531c9dbd37b69023d64ee2ad7228db07a10aa Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Mon, 10 Oct 2022 21:46:56 +0200 Subject: [PATCH 6/7] use an instruction synch barrier instead of memory barrier --- src/vp8/util/generic_worker.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vp8/util/generic_worker.cc b/src/vp8/util/generic_worker.cc index f158eb78..97d41ec2 100644 --- a/src/vp8/util/generic_worker.cc +++ b/src/vp8/util/generic_worker.cc @@ -37,7 +37,7 @@ void _cross_platform_pause() { #if !defined(USE_SCALAR) && defined(__i386__) _mm_pause(); #elif __ARM_NEON - __asm__ __volatile__("dmb ish"); + __asm__ __volatile__("isb"); #else #ifdef _WIN32 Sleep(0); From 0ca66b7f849ca42975a81a5de26ff2d7049704dc Mon Sep 17 00:00:00 2001 From: Michael Wallner Date: Mon, 4 Mar 2024 17:52:57 +0100 Subject: [PATCH 7/7] fix exit status race by calling exit_group --- src/io/Seccomp.cc | 7 ++++--- src/vp8/util/memory.cc | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/io/Seccomp.cc b/src/io/Seccomp.cc index c6ab3f17..66528b93 100644 --- a/src/io/Seccomp.cc +++ b/src/io/Seccomp.cc @@ -92,6 +92,7 @@ bool installStrictSyscallFilter(bool verbose) { #endif #endif ALLOW_SYSCALL(exit), + ALLOW_SYSCALL(exit_group), ALLOW_SYSCALL(read), ALLOW_SYSCALL(write), KILL_PROCESS, @@ -100,21 +101,21 @@ bool installStrictSyscallFilter(bool verbose) { prog.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])); prog.filter = filter; if ( -#ifdef USE_STANDARD_MEMORY_ALLOCATORS +#if defined USE_STANDARD_MEMORY_ALLOCATORS || ! defined USE_STRICT_SECCOMP true #else prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) #endif ) { -#ifndef USE_STANDARD_MEMORY_ALLOCATORS +#if ! defined USE_STANDARD_MEMORY_ALLOCATORS && defined USE_STRICT_SECCOMP if (verbose) { perror("prctl(SECCOMP)"); } -#endif if (errno == EINVAL && verbose) { fprintf(stderr, "SECCOMP_MODE_STRICT is not available.\n%s", "Trying to set a filter to emulate strict mode\n"); } +#endif if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { if (verbose) { perror("prctl(NO_NEW_PRIVS)"); diff --git a/src/vp8/util/memory.cc b/src/vp8/util/memory.cc index e0d5b4ee..fe672486 100644 --- a/src/vp8/util/memory.cc +++ b/src/vp8/util/memory.cc @@ -248,7 +248,11 @@ void custom_exit(ExitCode exit_code) { } } #ifdef __linux__ +# ifdef USE_STRICT_SECCOMP syscall(SYS_exit, (int)exit_code); +# else + syscall(SYS_exit_group, (int)exit_code); +# endif #else exit((int)exit_code); #endif
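
[Editor's note, not part of the patch series.] The last commit, "fix exit status race by calling exit_group", relies on a Linux detail worth spelling out: SYS_exit terminates only the calling thread, while SYS_exit_group terminates every thread in the process at once. If custom_exit() runs on a worker thread while other threads are still alive, the status handed to SYS_exit can be lost and the process later exits with whatever the surviving threads report. The sketch below is a minimal illustration of that semantics only; it is not lepton code, and the file name, the USE_EXIT_GROUP switch and the status value 7 are invented for the example.

    /* exit_demo.c - illustrates SYS_exit vs SYS_exit_group on Linux.
     * Build: cc -pthread exit_demo.c                     (worker uses SYS_exit)
     *        cc -pthread -DUSE_EXIT_GROUP exit_demo.c    (worker uses SYS_exit_group)
     */
    #include <pthread.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void *worker(void *arg) {
        (void)arg;
    #ifdef USE_EXIT_GROUP
        syscall(SYS_exit_group, 7);   /* whole process terminates here with status 7 */
    #else
        syscall(SYS_exit, 7);         /* only this thread dies; the intended status is lost */
    #endif
        return NULL;                  /* not reached */
    }

    int main(void) {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        sleep(1);                     /* without exit_group, execution reaches this point ... */
        return 0;                     /* ... and the shell sees status 0 instead of 7 */
    }

Running "./a.out; echo $?" prints 0 for the SYS_exit build and 7 for the SYS_exit_group build, which is the behaviour the final memory.cc hunk selects for custom_exit(); the Seccomp.cc hunk in the same commit also adds exit_group to the syscall whitelist so the call is permitted under the filter.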