Skip to content

Commit e333d60

Browse files
authored
[AArch64] Replace expensive move from wzr by two moves via floating point immediate (#146538)
We've noticed that inserting 0 into a known vector lane is implemented via a move from wzr, i.e., a move between register banks. We think it is cheaper (and have seen improvements on our benchmarks) to materialize 0 in a floating-point register and insert from there. PR: #146538
1 parent 5b8c15c commit e333d60

File tree

5 files changed

+89
-14
lines changed

5 files changed

+89
-14
lines changed

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
840840
"HasDisableFastIncVL", "true",
841841
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
842842

843+
// On most processors we want to avoid moving from WZR to vector registers
844+
// (relying on materializing 0 to a FPR and moving from there instead),
845+
// but on some (in-order) cores it's preferable to avoid the extra instruction.
846+
def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
847+
"UseWzrToVecMove", "true",
848+
"Move from WZR to insert 0 into vector registers">;
849+
843850
//===----------------------------------------------------------------------===//
844851
// Architectures.
845852
//

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
419419

420420
def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
421421

422+
def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
423+
422424

423425
//===----------------------------------------------------------------------===//
424426
// AArch64-specific DAG Nodes.
@@ -7377,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
73777379
(i64 0)),
73787380
dsub)>;
73797381

7382+
let Predicates = [UseWzrToVecMove] in {
73807383
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
73817384
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
73827385
def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
@@ -7387,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm))
73877390
(EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
73887391
def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
73897392
(INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
7393+
}
73907394

73917395
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
73927396
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
2121
"Cortex-A320 ARM processors", [
2222
FeatureFuseAES,
2323
FeatureFuseAdrpAdd,
24-
FeaturePostRAScheduler]>;
24+
FeaturePostRAScheduler,
25+
FeatureUseWzrToVecMove]>;
2526

2627
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
2728
"Cortex-A53 ARM processors", [
2829
FeatureFuseAES,
2930
FeatureFuseAdrpAdd,
3031
FeatureBalanceFPOps,
31-
FeaturePostRAScheduler]>;
32+
FeaturePostRAScheduler,
33+
FeatureUseWzrToVecMove]>;
3234

3335
def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
3436
"Cortex-A55 ARM processors", [
3537
FeatureFuseAES,
3638
FeatureFuseAdrpAdd,
3739
FeaturePostRAScheduler,
38-
FeatureFuseAddress]>;
40+
FeatureFuseAddress,
41+
FeatureUseWzrToVecMove]>;
3942

4043
def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
4144
"Cortex-A510 ARM processors", [
4245
FeatureFuseAES,
4346
FeatureFuseAdrpAdd,
44-
FeaturePostRAScheduler
47+
FeaturePostRAScheduler,
48+
FeatureUseWzrToVecMove
4549
]>;
4650

4751
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
4852
"Cortex-A520 ARM processors", [
4953
FeatureFuseAES,
5054
FeatureFuseAdrpAdd,
51-
FeaturePostRAScheduler]>;
55+
FeaturePostRAScheduler,
56+
FeatureUseWzrToVecMove]>;
5257

5358
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
5459
"Cortex-A520AE ARM processors", [
5560
FeatureFuseAES,
5661
FeatureFuseAdrpAdd,
57-
FeaturePostRAScheduler]>;
62+
FeaturePostRAScheduler,
63+
FeatureUseWzrToVecMove]>;
5864

5965
def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
6066
"Cortex-A57 ARM processors", [

llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) {
172172
; CHECK-LABEL: test_insert_v8f16_insert_1:
173173
; CHECK: // %bb.0:
174174
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
175+
; CHECK-NEXT: movi d1, #0000000000000000
175176
; CHECK-NEXT: dup.8h v0, v0[0]
176-
; CHECK-NEXT: mov.h v0[7], wzr
177+
; CHECK-NEXT: mov.h v0[7], v1[0]
177178
; CHECK-NEXT: ret
178179
%v.0 = insertelement <8 x half> <half undef, half undef, half undef, half undef, half undef, half undef, half undef, half 0.0>, half %a, i32 0
179180
%v.1 = insertelement <8 x half> %v.0, half %a, i32 1
@@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
278279
; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
279280
; CHECK: // %bb.0:
280281
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
282+
; CHECK-NEXT: movi d1, #0000000000000000
281283
; CHECK-NEXT: dup.4s v0, v0[0]
282-
; CHECK-NEXT: mov.s v0[3], wzr
284+
; CHECK-NEXT: mov.s v0[3], v1[0]
283285
; CHECK-NEXT: ret
284286
%v.0 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %a, i32 0
285287
%v.1 = insertelement <4 x float> %v.0, float %a, i32 1
@@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) {
347349
ret <8 x i16> %v.0
348350
}
349351

350-
; TODO: This should just be a mov.s v0[3], wzr
351352
define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
352353
; CHECK-LABEL: test_insert_v4f16_f16_zero:
353354
; CHECK: // %bb.0:
355+
; CHECK-NEXT: movi d1, #0000000000000000
354356
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
355-
; CHECK-NEXT: mov.h v0[0], wzr
357+
; CHECK-NEXT: mov.h v0[0], v1[0]
356358
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
357359
; CHECK-NEXT: ret
358360
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
@@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
362364
define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
363365
; CHECK-LABEL: test_insert_v8f16_f16_zero:
364366
; CHECK: // %bb.0:
365-
; CHECK-NEXT: mov.h v0[6], wzr
367+
; CHECK-NEXT: movi d1, #0000000000000000
368+
; CHECK-NEXT: mov.h v0[6], v1[0]
366369
; CHECK-NEXT: ret
367370
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
368371
ret <8 x half> %v.0
@@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
371374
define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
372375
; CHECK-LABEL: test_insert_v2f32_f32_zero:
373376
; CHECK: // %bb.0:
377+
; CHECK-NEXT: movi d1, #0000000000000000
374378
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
375-
; CHECK-NEXT: mov.s v0[0], wzr
379+
; CHECK-NEXT: mov.s v0[0], v1[0]
376380
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
377381
; CHECK-NEXT: ret
378382
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
@@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
382386
define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
383387
; CHECK-LABEL: test_insert_v4f32_f32_zero:
384388
; CHECK: // %bb.0:
385-
; CHECK-NEXT: mov.s v0[3], wzr
389+
; CHECK-NEXT: movi d1, #0000000000000000
390+
; CHECK-NEXT: mov.s v0[3], v1[0]
386391
; CHECK-NEXT: ret
387392
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
388393
ret <4 x float> %v.0
@@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
391396
define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) {
392397
; CHECK-LABEL: test_insert_v2f64_f64_zero:
393398
; CHECK: // %bb.0:
399+
; CHECK-NEXT: movi d1, #0000000000000000
400+
; CHECK-NEXT: mov.d v0[1], v1[0]
401+
; CHECK-NEXT: ret
402+
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
403+
ret <2 x double> %v.0
404+
}
405+
406+
define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 {
407+
; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr:
408+
; CHECK: // %bb.0:
409+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
410+
; CHECK-NEXT: mov.h v0[0], wzr
411+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
412+
; CHECK-NEXT: ret
413+
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
414+
ret <4 x half> %v.0
415+
}
416+
417+
define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 {
418+
; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr:
419+
; CHECK: // %bb.0:
420+
; CHECK-NEXT: mov.h v0[6], wzr
421+
; CHECK-NEXT: ret
422+
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
423+
ret <8 x half> %v.0
424+
}
425+
426+
define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 {
427+
; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr:
428+
; CHECK: // %bb.0:
429+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
430+
; CHECK-NEXT: mov.s v0[0], wzr
431+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
432+
; CHECK-NEXT: ret
433+
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
434+
ret <2 x float> %v.0
435+
}
436+
437+
define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 {
438+
; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr:
439+
; CHECK: // %bb.0:
440+
; CHECK-NEXT: mov.s v0[3], wzr
441+
; CHECK-NEXT: ret
442+
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
443+
ret <4 x float> %v.0
444+
}
445+
446+
define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 {
447+
; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr:
448+
; CHECK: // %bb.0:
394449
; CHECK-NEXT: mov.d v0[1], xzr
395450
; CHECK-NEXT: ret
396451
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
397452
ret <2 x double> %v.0
398453
}
454+
455+
attributes #1 = {"tune-cpu"="cortex-a55"}

llvm/test/CodeGen/AArch64/vecreduce-fadd.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
6363
;
6464
; CHECK-SD-FP16-LABEL: add_v3HalfH:
6565
; CHECK-SD-FP16: // %bb.0:
66+
; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000
6667
; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
67-
; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr
68+
; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0]
6869
; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
6970
; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
7071
; CHECK-SD-FP16-NEXT: ret

0 commit comments

Comments
 (0)