113113 trn1 \data1\().2d , t1.2d , t3.2d
114114.endm
115115
// ldr_vo_scalar vec, base, offset [, tmp0 [, tmp1]]
//
// Load the 128-bit vector register \vec from [\base + \offset] using two
// 64-bit scalar loads followed by two lane inserts, instead of a single
// 128-bit vector load. The intent (w_scalar load optimization) is to allow
// better interleaving of the loads with surrounding vector work.
//
// Arguments:
//   vec     vector register name without arrangement suffix (e.g. v8 or a
//           .req alias of it)
//   base    64-bit base address register
//   offset  constant byte offset; \offset and \offset + 8 must both be
//           encodable as LDR (64-bit, unsigned offset) immediates
//   tmp0    scratch X register for the low 64 bits  (default: x10)
//   tmp1    scratch X register for the high 64 bits (default: x11)
//
// Clobbers: \tmp0 and \tmp1 (x10 and x11 unless overridden).
// Callers that keep live values in x10/x11 must pass different temporaries.
.macro ldr_vo_scalar vec, base, offset, tmp0=x10, tmp1=x11
        // \offset is parenthesised so that an expression argument cannot
        // change meaning once "+ 8" is appended.
        ldr     \tmp0, [\base, #(\offset)]
        ldr     \tmp1, [\base, #((\offset) + 8)]
        ins     \vec\().d[0], \tmp0         // low half  <- bytes 0..7
        ins     \vec\().d[1], \tmp1         // high half <- bytes 8..15
.endm

124+
116125.macro save_vregs
117126 sub sp , sp , #( 16 * 4 )
118127 stp d8 , d9 , [ sp , # 16 * 0 ]
145154 inp .req x3
146155 count .req x4
147156 wtmp .req w5
// Second data pointer. In the visible code it is set to in + 64 before the
// ntt_layer45678_start loop and addresses data4-7 there.
157+ in2 .req x6
158+
159+ // Scalar temporaries for w_scalar load optimization
// NOTE(review): x10/x11 are also the scratch registers that the
// ldr_vo_scalar macro uses by register name; xtmp0/xtmp1 themselves are not
// referenced anywhere in the visible code. Confirm they are either used or
// removed, and that a matching .unreq exists for each.
160+ xtmp0 .req x10
161+ xtmp1 .req x11
148162
149163 data0 .req v8
150164 data1 .req v9
@@ -208,14 +222,14 @@ MLD_ASM_FN_SYMBOL(ntt_asm)
208222
209223ntt_layer123_start:
210224
211- ldr q_data0 , [ in , # 0 ]
212- ldr q_data1 , [ in , # ( 1 * ( 1024 / 8 )) ]
213- ldr q_data2 , [ in , # ( 2 * ( 1024 / 8 )) ]
214- ldr q_data3 , [ in , # ( 3 * ( 1024 / 8 )) ]
215- ldr q_data4 , [ in , # ( 4 * ( 1024 / 8 )) ]
216- ldr q_data5 , [ in , # ( 5 * ( 1024 / 8 )) ]
217- ldr q_data6 , [ in , # ( 6 * ( 1024 / 8 )) ]
218- ldr q_data7 , [ in , # ( 7 * ( 1024 / 8 )) ]
225+ ldr_vo_scalar data0 , in , 0
226+ ldr_vo_scalar data1 , in , ( 1 * ( 1024 / 8 ))
227+ ldr_vo_scalar data2 , in , ( 2 * ( 1024 / 8 ))
228+ ldr_vo_scalar data3 , in , ( 3 * ( 1024 / 8 ))
229+ ldr_vo_scalar data4 , in , ( 4 * ( 1024 / 8 ))
230+ ldr_vo_scalar data5 , in , ( 5 * ( 1024 / 8 ))
231+ ldr_vo_scalar data6 , in , ( 6 * ( 1024 / 8 ))
232+ ldr_vo_scalar data7 , in , ( 7 * ( 1024 / 8 ))
219233
220234 ct_butterfly data0 , data4 , root0 , 0 , 1
221235 ct_butterfly data1 , data5 , root0 , 0 , 1
@@ -248,19 +262,22 @@ ntt_layer123_start:
248262 cbnz count , ntt_layer123_start
249263
250264 mov in , inp
265+ add in2 , in , # 64 // in2 points 64 bytes ahead for data4 - 7
251266 mov count , # 8
252267
253268 .p2align 2
254269ntt_layer45678_start:
255270
256- ldr q_data0 , [ in , #( 16 * 0 ) ]
257- ldr q_data1 , [ in , #( 16 * 1 ) ]
258- ldr q_data2 , [ in , #( 16 * 2 ) ]
259- ldr q_data3 , [ in , #( 16 * 3 ) ]
260- ldr q_data4 , [ in , #( 16 * 4 ) ]
261- ldr q_data5 , [ in , #( 16 * 5 ) ]
262- ldr q_data6 , [ in , #( 16 * 6 ) ]
263- ldr q_data7 , [ in , #( 16 * 7 ) ]
271+ // Load data0 - 3 from in (bytes 0 - 63 )
272+ ldr_vo_scalar data0 , in , ( 16 * 0 )
273+ ldr_vo_scalar data1 , in , ( 16 * 1 )
274+ ldr_vo_scalar data2 , in , ( 16 * 2 )
275+ ldr_vo_scalar data3 , in , ( 16 * 3 )
276+ // Load data4 - 7 from in2 (bytes 64 - 127 )
277+ ldr_vo_scalar data4 , in2 , ( 16 * 0 )
278+ ldr_vo_scalar data5 , in2 , ( 16 * 1 )
279+ ldr_vo_scalar data6 , in2 , ( 16 * 2 )
280+ ldr_vo_scalar data7 , in2 , ( 16 * 3 )
264281
265282 load_next_roots_456
266283
@@ -300,17 +317,10 @@ ntt_layer45678_start:
300317 ct_butterfly_v data6 , data7 , root2 , root2_tw
301318 // Bounds: |data{i}| < 9q
302319
303- transpose4 data0 , data1 , data2 , data3
304- transpose4 data4 , data5 , data6 , data7
305-
306- str q_data0 , [ in ], #( 16 * 8 )
307- str q_data1 , [ in , #( - 16 * 7 ) ]
308- str q_data2 , [ in , #( - 16 * 6 ) ]
309- str q_data3 , [ in , #( - 16 * 5 ) ]
310- str q_data4 , [ in , #( - 16 * 4 ) ]
311- str q_data5 , [ in , #( - 16 * 3 ) ]
312- str q_data6 , [ in , #( - 16 * 2 ) ]
313- str q_data7 , [ in , #( - 16 * 1 ) ]
// Write the eight result vectors back with two ST4 stores.
// ST4 with four .4S registers stores element-interleaved: lane 0 of each of
// the four registers first, then lane 1 of each, and so on (64 bytes total).
// NOTE(review): this is presumably meant to produce the same memory layout
// as the removed transpose4 + str q_data* sequence -- confirm against the
// transpose4 macro.
// data0-3 go to the 64 bytes at in, data4-7 to the 64 bytes at in2.
320+ st4 {data0.4S , data1.4S , data2.4S , data3.4S} , [ in ], # 64
321+ st4 {data4.4S , data5.4S , data6.4S , data7.4S} , [ in2 ], # 64
// Each post-indexed ST4 already advanced its pointer by 64; add another 64
// so that in and in2 both move 128 bytes per iteration.
322+ add in , in , # 64
323+ add in2 , in2 , # 64
314324
315325 subs count , count , # 1
316326 cbnz count , ntt_layer45678_start
@@ -325,6 +335,7 @@ ntt_layer45678_start:
325335 .unreq inp
326336 .unreq count
327337 .unreq wtmp
338+ .unreq in2
328339 .unreq data0
329340 .unreq data1
330341 .unreq data2
0 commit comments