Skip to content

Commit

Permalink
Unify register assignment in NEON assembly for get_sad
Browse files Browse the repository at this point in the history
This makes the code easier to read and reduces the compressed compiled object size by 10%.
  • Loading branch information
barrbrain committed Dec 4, 2023
1 parent b532690 commit 60dd8a8
Showing 1 changed file with 61 additions and 78 deletions.
139 changes: 61 additions & 78 deletions src/arm/64/sad.S
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
* Copyright (c) 2020-2023, The rav1e contributors. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
Expand All @@ -14,23 +15,12 @@

.macro sad_rect width, height
function sad\width\()x\height\()_neon, export=1
.if \width == 128
movi v3.4s, #0
.else
movi v0.4s, #0
.endif
sxtw x1, w1
.if \width == 128
movi v18.4s, #0
.endif
sxtw x3, w3
mov w4, \height
.if \width == 128
mov v2.16b, v3.16b
.elseif \width >= 32
.if \width >= 16
mov v1.16b, v0.16b
.elseif \width == 16
mov v3.16b, v0.16b
.endif
b L(sad_w\width\())
endfunc
Expand All @@ -42,12 +32,12 @@ function sad4x4_neon, export=1
sxtw x3, w3
mov w4, #4
L(sad_w4):
ldr d1, [x0]
ldr d2, [x2]
ldr s2, [x0]
ldr s3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal v0.8h, v2.8b, v3.8b
bne L(sad_w4)
uaddlp v0.2s, v0.4h
uaddlp v0.1d, v0.2s
Expand Down Expand Up @@ -86,25 +76,21 @@ function sad64x64_neon, export=1
mov w4, #64
mov v1.16b, v0.16b
L(sad_w64):
ldr q16, [x0]
ldr q17, [x2]
ldr q6, [x0, #16]
ldr q7, [x2, #16]
ldr q4, [x0, #32]
ldr q5, [x2, #32]
ldr q2, [x0, #48]
ldr q3, [x2, #48]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
ldp q6, q16, [x0, #32]
ldp q7, q17, [x2, #32]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v16.8b, v17.8b
uabal2 v1.8h, v16.16b, v17.16b
uabal v0.8h, v6.8b, v7.8b
uabal2 v1.8h, v6.16b, v7.16b
uabal v0.8h, v4.8b, v5.8b
uabal2 v1.8h, v4.16b, v5.16b
uabal v0.8h, v2.8b, v3.8b
uabal2 v1.8h, v2.16b, v3.16b
uabal v0.8h, v4.8b, v5.8b
uabal2 v1.8h, v4.16b, v5.16b
uabal v0.8h, v6.8b, v7.8b
uabal2 v1.8h, v6.16b, v7.16b
uabal v0.8h, v16.8b, v17.8b
uabal2 v1.8h, v16.16b, v17.16b
bne L(sad_w64)
horizontal_long_add_16x8
endfunc
Expand All @@ -114,48 +100,47 @@ sad_rect 64, 32
sad_rect 64, 128

function sad128x128_neon, export=1
movi v3.4s, #0
movi v0.4s, #0
sxtw x1, w1
movi v18.4s, #0
sxtw x3, w3
mov w4, #128
mov v2.16b, v3.16b
mov v1.16b, v0.16b
L(sad_w128):
ldp q0, q25, [x0]
ldp q28, q26, [x2]
ldp q23, q21, [x0, #32]
ldp q24, q22, [x2, #32]
ldp q19, q16, [x0, #64]
ldp q20, q17, [x2, #64]
ldp q6, q4, [x0, #96]
ldp q7, q5, [x2, #96]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
ldp q6, q16, [x0, #32]
ldp q7, q17, [x2, #32]
ldp q18, q20, [x0, #64]
ldp q19, q21, [x2, #64]
ldp q22, q24, [x0, #96]
ldp q23, q25, [x2, #96]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabdl v18.8h, v0.8b, v28.8b
uabal2 v18.8h, v0.16b, v28.16b
uabal v18.8h, v25.8b, v26.8b
uabal2 v18.8h, v25.16b, v26.16b
uabal v18.8h, v23.8b, v24.8b
uabal2 v18.8h, v23.16b, v24.16b
uabal v18.8h, v21.8b, v22.8b
uabal2 v18.8h, v21.16b, v22.16b
uabal v18.8h, v19.8b, v20.8b
uabal2 v18.8h, v19.16b, v20.16b
uabal v18.8h, v16.8b, v17.8b
uabal2 v18.8h, v16.16b, v17.16b
uabal v18.8h, v6.8b, v7.8b
uabal2 v18.8h, v6.16b, v7.16b
uabal v18.8h, v4.8b, v5.8b
uabal2 v18.8h, v4.16b, v5.16b
uaddw v3.4s, v3.4s, v18.4h
uaddw2 v2.4s, v2.4s, v18.8h
uabdl v26.8h, v2.8b, v3.8b
uabal2 v26.8h, v2.16b, v3.16b
uabal v26.8h, v4.8b, v5.8b
uabal2 v26.8h, v4.16b, v5.16b
uabal v26.8h, v6.8b, v7.8b
uabal2 v26.8h, v6.16b, v7.16b
uabal v26.8h, v16.8b, v17.8b
uabal2 v26.8h, v16.16b, v17.16b
uabal v26.8h, v18.8b, v19.8b
uabal2 v26.8h, v18.16b, v19.16b
uabal v26.8h, v20.8b, v21.8b
uabal2 v26.8h, v20.16b, v21.16b
uabal v26.8h, v22.8b, v23.8b
uabal2 v26.8h, v22.16b, v23.16b
uabal v26.8h, v24.8b, v25.8b
uabal2 v26.8h, v24.16b, v25.16b
uaddw v1.4s, v1.4s, v26.4h
uaddw2 v0.4s, v0.4s, v26.8h
bne L(sad_w128)
add v2.4s, v2.4s, v3.4s
uaddlp v2.2d, v2.4s
dup d0, v2.d[1]
add v2.2s, v0.2s, v2.2s
umov w0, v2.s[0]
add v0.4s, v0.4s, v1.4s
uaddlp v0.2d, v0.4s
dup d3, v0.d[1]
add v0.2s, v0.2s, v3.2s
umov w0, v0.s[0]
ret
endfunc

Expand All @@ -168,17 +153,15 @@ function sad32x32_neon, export=1
mov w4, #32
mov v1.16b, v0.16b
L(sad_w32):
ldr q4, [x0]
ldr q5, [x2]
ldr q2, [x0, #16]
ldr q3, [x2, #16]
ldp q2, q4, [x0]
ldp q3, q5, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v1.8h, v4.8b, v5.8b
uabal2 v0.8h, v4.16b, v5.16b
uabal v1.8h, v2.8b, v3.8b
uabal2 v0.8h, v2.16b, v3.16b
uabal v1.8h, v4.8b, v5.8b
uabal2 v0.8h, v4.16b, v5.16b
bne L(sad_w32)
add v0.8h, v0.8h, v1.8h
horizontal_add_16x8
Expand All @@ -193,17 +176,17 @@ function sad16x16_neon, export=1
sxtw x1, w1
sxtw x3, w3
mov w4, #16
mov v3.16b, v0.16b
mov v1.16b, v0.16b
L(sad_w16):
ldr q1, [x0]
ldr q2, [x2]
ldr q2, [x0]
ldr q3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal2 v3.8h, v1.16b, v2.16b
uabal v0.8h, v2.8b, v3.8b
uabal2 v1.8h, v2.16b, v3.16b
bne L(sad_w16)
add v0.8h, v0.8h, v3.8h
add v0.8h, v0.8h, v1.8h
horizontal_add_16x8
endfunc

Expand All @@ -218,12 +201,12 @@ function sad8x8_neon, export=1
sxtw x3, w3
mov w4, #8
L(sad_w8):
ldr d1, [x0]
ldr d2, [x2]
ldr d2, [x0]
ldr d3, [x2]
add x0, x0, x1
add x2, x2, x3
subs w4, w4, #1
uabal v0.8h, v1.8b, v2.8b
uabal v0.8h, v2.8b, v3.8b
bne L(sad_w8)
horizontal_add_16x8
endfunc
Expand Down

0 comments on commit 60dd8a8

Please sign in to comment.