Skip to content

Commit 540908f

Browse files
committed
[TEST] Switch NTT to st4 and use scalar loads
Signed-off-by: Matthias J. Kannwischer <[email protected]>
1 parent 6e9e43f commit 540908f

File tree

4 files changed

+1874
-1610
lines changed

4 files changed

+1874
-1610
lines changed

dev/aarch64_clean/src/ntt.S

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,15 @@
113113
trn1 \data1\().2d, t1.2d, t3.2d
114114
.endm
115115

116+
// ldr_vo_scalar vec, base, offset
//
// w_scalar load optimization: load one 128-bit vector register using two
// 64-bit scalar loads followed by two lane inserts, instead of a single
// 128-bit vector load.
//
//   vec    - vector register name without an element suffix (e.g. data0)
//   base   - general-purpose register holding the base address
//   offset - constant byte offset; both offset and offset + 8 are used as
//            load immediates
//
// Clobbers: x10 and x11 (hard-coded scalar temporaries).
117+
// Splitting the load into four separate instructions allows better
// interleaving with the surrounding code.
118+
.macro ldr_vo_scalar vec, base, offset
119+
ldr x10, [\base, #\offset]              // x10 = low 64 bits of the vector
120+
ldr x11, [\base, #(\offset + 8)]        // x11 = high 64 bits of the vector
121+
ins \vec\().d[0], x10                   // lane 0 <- low half
122+
ins \vec\().d[1], x11                   // lane 1 <- high half
123+
.endm
124+
116125
.macro save_vregs
117126
sub sp, sp, #(16*4)
118127
stp d8, d9, [sp, #16*0]
@@ -145,6 +154,11 @@
145154
inp .req x3
146155
count .req x4
147156
wtmp .req w5
157+
in2 .req x6                             // second data pointer; set to in + 64 before the ntt_layer45678 loop
158+
159+
// Scalar temporaries for w_scalar load optimization (the ldr_vo_scalar macro).
// NOTE(review): ldr_vo_scalar names x10 and x11 directly rather than going
// through these aliases, so the two must be kept in sync by hand.
160+
xtmp0 .req x10
161+
xtmp1 .req x11
148162

149163
data0 .req v8
150164
data1 .req v9
@@ -208,14 +222,14 @@ MLD_ASM_FN_SYMBOL(ntt_asm)
208222

209223
ntt_layer123_start:
210224

211-
ldr q_data0, [in, #0]
212-
ldr q_data1, [in, #(1*(1024/8))]
213-
ldr q_data2, [in, #(2*(1024/8))]
214-
ldr q_data3, [in, #(3*(1024/8))]
215-
ldr q_data4, [in, #(4*(1024/8))]
216-
ldr q_data5, [in, #(5*(1024/8))]
217-
ldr q_data6, [in, #(6*(1024/8))]
218-
ldr q_data7, [in, #(7*(1024/8))]
225+
ldr_vo_scalar data0, in, 0
226+
ldr_vo_scalar data1, in, (1*(1024/8))
227+
ldr_vo_scalar data2, in, (2*(1024/8))
228+
ldr_vo_scalar data3, in, (3*(1024/8))
229+
ldr_vo_scalar data4, in, (4*(1024/8))
230+
ldr_vo_scalar data5, in, (5*(1024/8))
231+
ldr_vo_scalar data6, in, (6*(1024/8))
232+
ldr_vo_scalar data7, in, (7*(1024/8))
219233

220234
ct_butterfly data0, data4, root0, 0, 1
221235
ct_butterfly data1, data5, root0, 0, 1
@@ -248,19 +262,22 @@ ntt_layer123_start:
248262
cbnz count, ntt_layer123_start
249263

250264
mov in, inp
265+
add in2, in, #64 // in2 points 64 bytes ahead for data4-7
251266
mov count, #8
252267

253268
.p2align 2
254269
ntt_layer45678_start:
255270

256-
ldr q_data0, [in, #(16*0)]
257-
ldr q_data1, [in, #(16*1)]
258-
ldr q_data2, [in, #(16*2)]
259-
ldr q_data3, [in, #(16*3)]
260-
ldr q_data4, [in, #(16*4)]
261-
ldr q_data5, [in, #(16*5)]
262-
ldr q_data6, [in, #(16*6)]
263-
ldr q_data7, [in, #(16*7)]
271+
// Load data0-3 from in (bytes 0-63)
272+
ldr_vo_scalar data0, in, (16*0)
273+
ldr_vo_scalar data1, in, (16*1)
274+
ldr_vo_scalar data2, in, (16*2)
275+
ldr_vo_scalar data3, in, (16*3)
276+
// Load data4-7 from in2 (bytes 64-127)
277+
ldr_vo_scalar data4, in2, (16*0)
278+
ldr_vo_scalar data5, in2, (16*1)
279+
ldr_vo_scalar data6, in2, (16*2)
280+
ldr_vo_scalar data7, in2, (16*3)
264281

265282
load_next_roots_456
266283

@@ -300,17 +317,10 @@ ntt_layer45678_start:
300317
ct_butterfly_v data6, data7, root2, root2_tw
301318
// Bounds: |data{i}| < 9q
302319

303-
transpose4 data0, data1, data2, data3
304-
transpose4 data4, data5, data6, data7
305-
306-
str q_data0, [in], #(16*8)
307-
str q_data1, [in, #(-16*7)]
308-
str q_data2, [in, #(-16*6)]
309-
str q_data3, [in, #(-16*5)]
310-
str q_data4, [in, #(-16*4)]
311-
str q_data5, [in, #(-16*3)]
312-
str q_data6, [in, #(-16*2)]
313-
str q_data7, [in, #(-16*1)]
320+
st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64
321+
st4 {data4.4S, data5.4S, data6.4S, data7.4S}, [in2], #64
322+
add in, in, #64
323+
add in2, in2, #64
314324

315325
subs count, count, #1
316326
cbnz count, ntt_layer45678_start
@@ -325,6 +335,7 @@ ntt_layer45678_start:
325335
.unreq inp
326336
.unreq count
327337
.unreq wtmp
338+
.unreq in2
// Also release the scalar temporary aliases, matching their .req
// definitions next to in2, so the names do not stay bound past this function.
.unreq xtmp0
.unreq xtmp1
328339
.unreq data0
329340
.unreq data1
330341
.unreq data2

dev/aarch64_opt/src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ all: ntt.S \
6363
ntt.S: ../../aarch64_clean/src/ntt.S
6464
# optimize first loop in one go and write to temp file
6565
$(eval TMPFILE := $(shell mktemp))
66-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMPFILE) -l ntt_layer123_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
66+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMPFILE) -l ntt_layer123_start $(SLOTHY_FLAGS) -c reserved_regs="[x1,x2,x3,x18--x30,sp]"
6767
# optimize second loop using split heuristic
6868
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMPFILE) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT) $(RESERVE_X_ONLY_FLAG)
6969

0 commit comments

Comments
 (0)