From 92c8faa67d105dc880096250e40f0faf6bc9952b Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Fri, 11 Jul 2025 01:38:50 -0700 Subject: [PATCH 1/2] [VPlan] Consider address computation cost in VPInterleaveRecipe. As a side-effect of #106431, we started vectorising loops such as: ```cpp void s351(float *a, float *b, float *c, long n) { float alpha = c[0]; for (int i = 0; i < n; i += 5) { a[i] += alpha * b[i]; a[i + 1] += alpha * b[i + 1]; a[i + 2] += alpha * b[i + 2]; a[i + 3] += alpha * b[i + 3]; a[i + 4] += alpha * b[i + 4]; } } ``` (https://godbolt.org/z/o1fGfj6j7), which resulted in a slowdown of 54%. The estimated cost of the scalar and vectorised versions is currently very close (51 vs 50.5 per lane), in favour of the vectorised version. This patch helps VPlan favour the scalar version by taking into account the cost of address computations in VPInterleaveRecipe, as is already done for the scalar version in LoopVectorizationCostModel::getMemoryInstructionCost and for other recipes such as VPWidenMemoryRecipe. This affects ``` LLVM :: Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll ``` due to the new cost estimate. I've updated the test accordingly, but please let me know if this is not appropriate. As mentioned in #82218, ideally the loop above should be rerolled before attempting vectorisation. I'm hoping the approach in this patch is a reasonable alternative in its own right in the meantime. If there's a better suggestion to achieve this, please let me know. 
:) --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 + ...-narrow-interleave-to-widen-memory-cost.ll | 77 +++++++------------ 2 files changed, 30 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 75ade13b09d9c..73d84588fbe87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3666,6 +3666,9 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); + // Add the address computation cost. + Cost += Ctx.TTI.getAddressComputationCost(WideVecTy); + if (!IG->isReverse()) return Cost; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index 173766cc0a656..2917bc0404bfa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -8,46 +8,31 @@ define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i6 ; CHECK-LABEL: define void @test_complex_add_float( ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = 
shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -84,32 +69,24 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i ; CHECK-LABEL: define void @test_complex_add_double( ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x double>, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = load <2 x double>, ptr [[TMP5]], align 4 -; CHECK-NEXT: 
[[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] From b3d5fba20e62f7290dd23efac9b32e1540bcb414 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Tue, 15 Jul 2025 04:21:50 -0700 Subject: [PATCH 2/2] Update cost of LoopVectorizationCostModel::getInterleaveGroupCost too. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 3 ++ .../LoopVectorize/AArch64/interleaved_cost.ll | 52 +++++++++---------- .../AArch64/sve-gather-scatter-cost.ll | 2 +- .../LoopVectorize/ARM/interleaved_cost.ll | 44 ++++++++-------- 4 files changed, 52 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 992f98cec0010..7fa8f5f4e5c9b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5306,6 +5306,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + // Add the address computation cost. + Cost += TTI.getAddressComputationCost(WideVecTy); + if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. assert(!Legal->isMaskRequired(I) && diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index a550f1ca14c8b..2214a9963ee07 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -13,15 +13,15 @@ entry: br label %for.body ; VF_8-LABEL: Checking a loop in 'i8_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_16-LABEL: Checking a loop in 
'i8_factor_2' -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_16: Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -44,20 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'i16_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For 
instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_2' -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_16: Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0 @@ -80,25 +80,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 
= load i32, ptr %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_8-LABEL: Checking a loop in 'i32_factor_2' -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_8: Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_2' -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_16: Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0 @@ -121,25 +121,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, 
ptr %tmp0, align 8 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' -; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_4: Found an estimated cost of 5 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 5 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_8: Found an estimated cost of 9 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 9 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 
= load i64, ptr %tmp0, align 8 +; VF_16: Found an estimated cost of 17 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 17 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll index 5f13c8e9ac22e..b04c594f46823 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll @@ -95,7 +95,7 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo } ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2' -; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4 +; CHECK: LV: Found an estimated cost of 3 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4 define void @gather_nxv4i32_stride2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index 214d9abd712cd..3cc98fd4c18d3 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -13,15 +13,15 @@ entry: br label %for.body ; VF_8-LABEL: Checking a loop in 'i8_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For 
instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_2' -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_16: Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -44,20 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'i16_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an 
estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_2' -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_16: Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0 @@ -80,25 +80,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 
0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_8-LABEL: Checking a loop in 'i32_factor_2' -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_8: Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_2' -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_16: Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an 
estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0 @@ -121,15 +121,15 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'half_factor_2' -; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 41 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 41 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'half_factor_2' -; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 81 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 80 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 81 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %half.2, ptr %data, i64 %i, i32 0