[AArch64] Replace expensive move from wzr by two moves via floating point immediate (#146538)

juliannagele · web-flow · commit e333d6019daf · 2025-07-16T12:41:59.000+02:00
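Inserting 0 into a known vector lane is currently implemented as a move from wzr/xzr, i.e., a move between register banks (general-purpose to FP/vector). We think it is cheaper (and have seen improvements on our benchmarks) to materialize 0 in a floating-point register and insert from there. The move from wzr/xzr is kept behind the new `use-wzr-to-vec-move` subtarget feature, which this patch enables for the Cortex-A320, A53, A55, A510, A520 and A520AE tunings, where avoiding the extra instruction is preferable. PR: #146538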
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
                                                "HasDisableFastIncVL", "true",
                                                "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
 
+// On most processors we avoid moving from WZR to vector registers (instead
+// materializing 0 in an FPR and inserting from there), but on some (in-order)
+// cores it is preferable to use the WZR move and avoid the extra instruction.
+def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
+                                              "UseWzrToVecMove", "true",
+                                              "Move from WZR to insert 0 into vector registers">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
 
 def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
 
+def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
+
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -7377,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
               (i64 0)),
             dsub)>;
 
+let Predicates = [UseWzrToVecMove] in {
 def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
           (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
 def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
@@ -7387,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm))
           (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
 def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
           (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
+}
 
 def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
             (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
                                    "Cortex-A320 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   FeatureUseWzrToVecMove]>;
 
 def TuneA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeatureBalanceFPOps,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   FeatureUseWzrToVecMove]>;
 
 def TuneA55     : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
                                    "Cortex-A55 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler,
-                                   FeatureFuseAddress]>;
+                                   FeatureFuseAddress,
+                                   FeatureUseWzrToVecMove]>;
 
 def TuneA510    : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
                                    "Cortex-A510 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler
+                                   FeaturePostRAScheduler,
+                                   FeatureUseWzrToVecMove
                                    ]>;
 
 def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
                                    "Cortex-A520 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   FeatureUseWzrToVecMove]>;
 
 def TuneA520AE  : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
                                    "Cortex-A520AE ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   FeatureUseWzrToVecMove]>;
 
 def TuneA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    "Cortex-A57 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) {
 ; CHECK-LABEL: test_insert_v8f16_insert_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    dup.8h v0, v0[0]
-; CHECK-NEXT:    mov.h v0[7], wzr
+; CHECK-NEXT:    mov.h v0[7], v1[0]
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <8 x half> <half undef, half undef, half undef, half undef, half undef, half undef, half undef, half 0.0>, half %a, i32 0
   %v.1 = insertelement <8 x half> %v.0, half %a, i32 1
@@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
 ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    dup.4s v0, v0[0]
-; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    mov.s v0[3], v1[0]
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %a, i32 0
   %v.1 = insertelement <4 x float> %v.0, float %a, i32 1
@@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) {
   ret <8 x i16> %v.0
 }
 
-; TODO: This should jsut be a mov.s v0[3], wzr
 define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
 ; CHECK-LABEL: test_insert_v4f16_f16_zero:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov.h v0[0], wzr
+; CHECK-NEXT:    mov.h v0[0], v1[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
@@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
 define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
 ; CHECK-LABEL: test_insert_v8f16_f16_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov.h v0[6], wzr
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    mov.h v0[6], v1[0]
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
   ret <8 x half> %v.0
@@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
 define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
 ; CHECK-LABEL: test_insert_v2f32_f32_zero:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov.s v0[0], wzr
+; CHECK-NEXT:    mov.s v0[0], v1[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
@@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
 define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
 ; CHECK-LABEL: test_insert_v4f32_f32_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    mov.s v0[3], v1[0]
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
   ret <4 x float> %v.0
@@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
 define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) {
 ; CHECK-LABEL: test_insert_v2f64_f64_zero:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    mov.d v0[1], v1[0]
+; CHECK-NEXT:    ret
+  %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
+  ret <2 x double> %v.0
+}
+
+define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 {
+; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov.h v0[0], wzr
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
+  ret <4 x half> %v.0
+}
+
+define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 {
+; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov.h v0[6], wzr
+; CHECK-NEXT:    ret
+  %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
+  ret <8 x half> %v.0
+}
+
+define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 {
+; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov.s v0[0], wzr
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
+  ret <2 x float> %v.0
+}
+
+define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 {
+; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    ret
+  %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
+  ret <4 x float> %v.0
+}
+
+define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 {
+; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr:
+; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov.d v0[1], xzr
 ; CHECK-NEXT:    ret
   %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
   ret <2 x double> %v.0
 }
+
+attributes #1 = {"tune-cpu"="cortex-a55"}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx)  {
 ;
 ; CHECK-SD-FP16-LABEL: add_v3HalfH:
 ; CHECK-SD-FP16:       // %bb.0:
+; CHECK-SD-FP16-NEXT:    movi d1, #0000000000000000
 ; CHECK-SD-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-FP16-NEXT:    mov v0.h[3], wzr
+; CHECK-SD-FP16-NEXT:    mov v0.h[3], v1.h[0]
 ; CHECK-SD-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
 ; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
 ; CHECK-SD-FP16-NEXT:    ret