diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 992f98cec0010..7fa8f5f4e5c9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5306,6 +5306,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
       Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
       UseMaskForGaps);
 
+  // Add the address computation cost.
+  Cost += TTI.getAddressComputationCost(WideVecTy);
+
   if (Group->isReverse()) {
     // TODO: Add support for reversed masked interleaved access.
     assert(!Legal->isMaskRequired(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 75ade13b09d9c..73d84588fbe87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3666,6 +3666,9 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
       IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
 
+  // Add the address computation cost.
+  Cost += Ctx.TTI.getAddressComputationCost(WideVecTy);
+
   if (!IG->isReverse())
     return Cost;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index a550f1ca14c8b..2214a9963ee07 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -13,15 +13,15 @@ entry:
   br label %for.body
 
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_8:          Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
-; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_16:         Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
@@ -44,20 +44,20 @@ entry:
   br label %for.body
 
 ; VF_4-LABEL: Checking a loop in 'i16_factor_2'
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_4:          Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_8:          Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_2'
-; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_16:         Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0
@@ -80,25 +80,25 @@ entry:
   br label %for.body
 
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_2:          Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_4:          Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_8:          Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_2'
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_16:         Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0
@@ -121,25 +121,25 @@ entry:
   br label %for.body
 
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2:          Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 3 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
-; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_4:          Found an estimated cost of 5 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 5 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
-; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_8:          Found an estimated cost of 9 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 9 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
-; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_16:         Found an estimated cost of 17 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 17 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
index 5f13c8e9ac22e..b04c594f46823 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
@@ -95,7 +95,7 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 }
 
 ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2'
-; CHECK: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction:   %0 = load float, ptr %arrayidx, align 4
+; CHECK: LV: Found an estimated cost of 3 for VF vscale x 4 For instruction:   %0 = load float, ptr %arrayidx, align 4
 define void @gather_nxv4i32_stride2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 173766cc0a656..2917bc0404bfa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -8,46 +8,31 @@ define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i6
 ; CHECK-LABEL: define void @test_complex_add_float(
 ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -84,32 +69,24 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
 ; CHECK-LABEL: define void @test_complex_add_double(
 ; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = load <2 x double>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index 214d9abd712cd..3cc98fd4c18d3 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -13,15 +13,15 @@ entry:
   br label %for.body
 
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_2'
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_8:          Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 3 for VF 8 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
-; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_16:         Found an estimated cost of 3 for VF 16 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 3 for VF 16 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
@@ -44,20 +44,20 @@ entry:
   br label %for.body
 
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_4:          Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 3 for VF 4 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_2'
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_8:          Found an estimated cost of 3 for VF 8 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 3 for VF 8 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_2'
-; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_16:         Found an estimated cost of 5 for VF 16 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 5 for VF 16 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, ptr %data, i64 %i, i32 0
@@ -80,25 +80,25 @@ entry:
   br label %for.body
 
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_2:          Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_4:          Found an estimated cost of 3 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 3 for VF 4 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_8-LABEL:  Checking a loop in 'i32_factor_2'
-; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_8:          Found an estimated cost of 5 for VF 8 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 5 for VF 8 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_2'
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_16:         Found an estimated cost of 9 for VF 16 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 9 for VF 16 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, ptr %data, i64 %i, i32 0
@@ -121,15 +121,15 @@ entry:
   br label %for.body
 
 ; VF_4-LABEL: Checking a loop in 'half_factor_2'
-; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
+; VF_4:         Found an estimated cost of 41 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
 ; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 40 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_4-NEXT:    Found an estimated cost of 41 for VF 4 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in 'half_factor_2'
-; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2
+; VF_8:         Found an estimated cost of 81 for VF 8 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, ptr %tmp1, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 80 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_8-NEXT:    Found an estimated cost of 81 for VF 8 For instruction: store half %tmp3, ptr %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %half.2, ptr %data, i64 %i, i32 0