diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 992f98cec0010..7fa8f5f4e5c9b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5306,6 +5306,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + // Add the address computation cost. + Cost += TTI.getAddressComputationCost(WideVecTy); + if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. assert(!Legal->isMaskRequired(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 75ade13b09d9c..73d84588fbe87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3666,6 +3666,9 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps); + // Add the address computation cost. + Cost += Ctx.TTI.getAddressComputationCost(WideVecTy); + if (!IG->isReverse()) return Cost; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index a550f1ca14c8b..2214a9963ee07 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -13,15 +13,15 @@ entry: br label %for.body ; VF_8-LABEL: Checking a loop in 'i8_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_2' -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_16: Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -44,20 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'i16_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_2' -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_16: Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0 @@ -80,25 +80,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_8-LABEL: Checking a loop in 'i32_factor_2' -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_8: Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_2' -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_16: Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0 @@ -121,25 +121,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' -; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_4: Found an estimated cost of 5 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 5 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_8: Found an estimated cost of 9 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 9 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_16: Found an estimated cost of 17 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 17 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll index 5f13c8e9ac22e..b04c594f46823 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll @@ -95,7 +95,7 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo } ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2' -; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4 +; CHECK: LV: Found an estimated cost of 3 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4 define void @gather_nxv4i32_stride2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index 173766cc0a656..2917bc0404bfa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -8,46 +8,31 @@ define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i6 ; CHECK-LABEL: define void @test_complex_add_float( ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -84,32 +69,24 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i ; CHECK-LABEL: define void @test_complex_add_double( ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x double>, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = load <2 x double>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 4 -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index 214d9abd712cd..3cc98fd4c18d3 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -13,15 +13,15 @@ entry: br label %for.body ; VF_8-LABEL: Checking a loop in 'i8_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_2' -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_16: Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -44,20 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'i16_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_2' -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_2' -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_16: Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0 @@ -80,25 +80,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_2: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_4: Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_8-LABEL: Checking a loop in 'i32_factor_2' -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_8: Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_2' -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_16: Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0 @@ -121,15 +121,15 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in 'half_factor_2' -; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_4: Found an estimated cost of 41 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 41 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_8-LABEL: Checking a loop in 'half_factor_2' -; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_8: Found an estimated cost of 81 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, ptr %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 80 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 81 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %half.2, ptr %data, i64 %i, i32 0