[llvm] [LoopVectorize] Don't discount instructions scalarized due to tail folding (PR #109289)

Wed Nov 27 03:32:23 PST 2024

https://github.com/john-brawn-arm updated https://github.com/llvm/llvm-project/pull/109289

>From 87dceeabcc061958b444183ee9e2716a60019cb8 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Tue, 17 Sep 2024 17:05:22 +0100
Subject: [PATCH 1/2] [LoopVectorize] Don't discount instructions scalarized
 due to tail folding

When an instruction is scalarized due to tail folding, the cost that
we calculate (which is then returned by getWideningCost and thus
getInstructionCost) is already the scalarized cost, so
computePredInstDiscount shouldn't apply a scalarization discount to
these instructions.

Fixes #66652
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  12 +-
 .../AArch64/conditional-branches-cost.ll      | 345 +++---------------
 .../AArch64/induction-costs-sve.ll            | 316 +---------------
 .../PowerPC/vplan-force-tail-with-evl.ll      |  30 +-
 .../LoopVectorize/X86/small-size.ll           | 301 +++++++++------
 5 files changed, 269 insertions(+), 735 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 107fb38be31969..c64f8ccb3a84f7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5501,10 +5501,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // Scale the total scalar cost by block probability.
     ScalarCost /= getReciprocalPredBlockProb();
 
-    // Compute the discount. A non-negative discount means the vector version
-    // of the instruction costs more, and scalarizing would be beneficial.
-    Discount += VectorCost - ScalarCost;
-    ScalarCosts[I] = ScalarCost;
+    // Compute the discount, unless this instruction must be scalarized due to
+    // tail folding, as then the vector cost is already the scalar cost. A
+    // non-negative discount means the vector version of the instruction costs
+    // more, and scalarizing would be beneficial.
+    if (!foldTailByMasking() || getWideningDecision(I, VF) != CM_Scalarize) {
+      Discount += VectorCost - ScalarCost;
+      ScalarCosts[I] = ScalarCost;
+    }
   }
 
   return Discount;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 9910be7224674c..de1cae7383f863 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -529,93 +529,14 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED-LABEL: define void @latch_branch_cost(
 ; PRED-SAME: ptr [[DST:%.*]]) {
 ; PRED-NEXT:  entry:
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
-; PRED-NEXT:    [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
-; PRED-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
-; PRED-NEXT:    store i8 0, ptr [[TMP3]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
-; PRED-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; PRED:       pred.store.if1:
-; PRED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 1
-; PRED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
-; PRED-NEXT:    store i8 0, ptr [[TMP6]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; PRED:       pred.store.continue2:
-; PRED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
-; PRED-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; PRED:       pred.store.if3:
-; PRED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 2
-; PRED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
-; PRED-NEXT:    store i8 0, ptr [[TMP9]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; PRED:       pred.store.continue4:
-; PRED-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
-; PRED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
-; PRED:       pred.store.if5:
-; PRED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 3
-; PRED-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
-; PRED-NEXT:    store i8 0, ptr [[TMP12]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
-; PRED:       pred.store.continue6:
-; PRED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
-; PRED-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; PRED:       pred.store.if7:
-; PRED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
-; PRED-NEXT:    store i8 0, ptr [[TMP15]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; PRED:       pred.store.continue8:
-; PRED-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
-; PRED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; PRED:       pred.store.if9:
-; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 5
-; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
-; PRED-NEXT:    store i8 0, ptr [[TMP18]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; PRED:       pred.store.continue10:
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
-; PRED-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; PRED:       pred.store.if11:
-; PRED-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 6
-; PRED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
-; PRED-NEXT:    store i8 0, ptr [[TMP21]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; PRED:       pred.store.continue12:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
-; PRED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.if13:
-; PRED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 7
-; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
-; PRED-NEXT:    store i8 0, ptr [[TMP24]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.continue14:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; PRED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
-; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[FOR_BODY:%.*]]
 ; PRED:       loop:
-; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; PRED-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
 ; PRED-NEXT:    store i8 0, ptr [[GEP]], align 1
 ; PRED-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; PRED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -800,27 +721,27 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META6:![0-9]+]]
+; PRED-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META4:![0-9]+]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; PRED-NEXT:    [[TMP17:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META9:![0-9]+]]
+; PRED-NEXT:    [[TMP17:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META7:![0-9]+]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    [[TMP18:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; PRED-NEXT:    [[TMP19:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META11:![0-9]+]]
+; PRED-NEXT:    [[TMP19:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META9:![0-9]+]]
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP19]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT31:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT30]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    [[TMP20:%.*]] = icmp ugt <vscale x 4 x i32> [[BROADCAST_SPLAT31]], [[TMP18]]
 ; PRED-NEXT:    [[TMP21:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
 ; PRED-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]]
-; PRED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
+; PRED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
 ; PRED-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; PRED-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; PRED-NEXT:    [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
-; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
@@ -842,7 +763,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
 ; PRED:       loop.latch:
 ; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT:    br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]]
+; PRED-NEXT:    br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret i32 0
 ;
@@ -959,7 +880,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
 ; PRED-NEXT:    [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; PRED-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
-; PRED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PRED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
@@ -979,7 +900,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
 ; PRED-NEXT:    [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -1003,208 +924,34 @@ exit:
   ret void
 }
 
-define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
-; DEFAULT-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
+define void @low_trip_count_store(ptr %dst) {
+; DEFAULT-LABEL: define void @low_trip_count_store(
 ; DEFAULT-SAME: ptr [[DST:%.*]]) {
 ; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>
-; DEFAULT-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; DEFAULT:       pred.store.if:
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = add i8 [[TMP0]], 0
-; DEFAULT-NEXT:    store i8 [[TMP5]], ptr [[TMP4]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; DEFAULT:       pred.store.continue:
-; DEFAULT-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; DEFAULT:       pred.store.if1:
-; DEFAULT-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 1
-; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
-; DEFAULT-NEXT:    [[TMP9:%.*]] = add i8 [[TMP0]], 1
-; DEFAULT-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; DEFAULT:       pred.store.continue2:
-; DEFAULT-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; DEFAULT:       pred.store.if3:
-; DEFAULT-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 2
-; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
-; DEFAULT-NEXT:    [[TMP13:%.*]] = add i8 [[TMP0]], 2
-; DEFAULT-NEXT:    store i8 [[TMP13]], ptr [[TMP12]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; DEFAULT:       pred.store.continue4:
-; DEFAULT-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
-; DEFAULT:       pred.store.if5:
-; DEFAULT-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 3
-; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = add i8 [[TMP0]], 3
-; DEFAULT-NEXT:    store i8 [[TMP17]], ptr [[TMP16]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; DEFAULT:       pred.store.continue6:
-; DEFAULT-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
-; DEFAULT-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; DEFAULT:       pred.store.if7:
-; DEFAULT-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 4
-; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
-; DEFAULT-NEXT:    [[TMP21:%.*]] = add i8 [[TMP0]], 4
-; DEFAULT-NEXT:    store i8 [[TMP21]], ptr [[TMP20]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; DEFAULT:       pred.store.continue8:
-; DEFAULT-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
-; DEFAULT-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; DEFAULT:       pred.store.if9:
-; DEFAULT-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 5
-; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
-; DEFAULT-NEXT:    [[TMP25:%.*]] = add i8 [[TMP0]], 5
-; DEFAULT-NEXT:    store i8 [[TMP25]], ptr [[TMP24]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; DEFAULT:       pred.store.continue10:
-; DEFAULT-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
-; DEFAULT-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; DEFAULT:       pred.store.if11:
-; DEFAULT-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 6
-; DEFAULT-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
-; DEFAULT-NEXT:    [[TMP29:%.*]] = add i8 [[TMP0]], 6
-; DEFAULT-NEXT:    store i8 [[TMP29]], ptr [[TMP28]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; DEFAULT:       pred.store.continue12:
-; DEFAULT-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
-; DEFAULT-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
-; DEFAULT:       pred.store.if13:
-; DEFAULT-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 7
-; DEFAULT-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP31]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = add i8 [[TMP0]], 7
-; DEFAULT-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; DEFAULT:       pred.store.continue14:
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    br label [[LOOP:%.*]]
 ; DEFAULT:       loop:
-; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; DEFAULT-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
 ; DEFAULT-NEXT:    store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
 ; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
-; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; DEFAULT:       exit:
 ; DEFAULT-NEXT:    ret void
 ;
-; PRED-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
+; PRED-LABEL: define void @low_trip_count_store(
 ; PRED-SAME: ptr [[DST:%.*]]) {
 ; PRED-NEXT:  entry:
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
-; PRED-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; PRED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>
-; PRED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; PRED-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
-; PRED-NEXT:    [[TMP5:%.*]] = add i8 [[TMP0]], 0
-; PRED-NEXT:    store i8 [[TMP5]], ptr [[TMP4]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
-; PRED-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; PRED:       pred.store.if1:
-; PRED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 1
-; PRED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
-; PRED-NEXT:    [[TMP9:%.*]] = add i8 [[TMP0]], 1
-; PRED-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; PRED:       pred.store.continue2:
-; PRED-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
-; PRED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; PRED:       pred.store.if3:
-; PRED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 2
-; PRED-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
-; PRED-NEXT:    [[TMP13:%.*]] = add i8 [[TMP0]], 2
-; PRED-NEXT:    store i8 [[TMP13]], ptr [[TMP12]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; PRED:       pred.store.continue4:
-; PRED-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
-; PRED-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
-; PRED:       pred.store.if5:
-; PRED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 3
-; PRED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]]
-; PRED-NEXT:    [[TMP17:%.*]] = add i8 [[TMP0]], 3
-; PRED-NEXT:    store i8 [[TMP17]], ptr [[TMP16]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.continue6:
-; PRED-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
-; PRED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; PRED:       pred.store.if7:
-; PRED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT:    [[TMP21:%.*]] = add i8 [[TMP0]], 4
-; PRED-NEXT:    store i8 [[TMP21]], ptr [[TMP20]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; PRED:       pred.store.continue8:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
-; PRED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; PRED:       pred.store.if9:
-; PRED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 5
-; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
-; PRED-NEXT:    [[TMP25:%.*]] = add i8 [[TMP0]], 5
-; PRED-NEXT:    store i8 [[TMP25]], ptr [[TMP24]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; PRED:       pred.store.continue10:
-; PRED-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
-; PRED-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; PRED:       pred.store.if11:
-; PRED-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 6
-; PRED-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
-; PRED-NEXT:    [[TMP29:%.*]] = add i8 [[TMP0]], 6
-; PRED-NEXT:    store i8 [[TMP29]], ptr [[TMP28]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; PRED:       pred.store.continue12:
-; PRED-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
-; PRED-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
-; PRED:       pred.store.if13:
-; PRED-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 7
-; PRED-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP31]]
-; PRED-NEXT:    [[TMP33:%.*]] = add i8 [[TMP0]], 7
-; PRED-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; PRED:       pred.store.continue14:
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PRED-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
-; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; PRED-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
 ; PRED-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
 ; PRED-NEXT:    store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
 ; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -1405,7 +1152,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
 ; DEFAULT-NEXT:    [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[TMP80]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; DEFAULT:       middle.block:
 ; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -1436,7 +1183,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; DEFAULT:       loop.latch:
 ; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
 ; DEFAULT:       exit:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -1628,7 +1375,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT:    [[TMP84:%.*]] = xor <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
 ; PRED-NEXT:    [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0
-; PRED-NEXT:    br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; PRED-NEXT:    br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
@@ -1658,7 +1405,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED:       loop.latch:
 ; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -1741,7 +1488,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; DEFAULT:       middle.block:
 ; DEFAULT-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; DEFAULT:       scalar.ph:
@@ -1757,7 +1504,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; DEFAULT-NEXT:    [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32
 ; DEFAULT-NEXT:    store i32 [[T]], ptr [[DST]], align 4
 ; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21
-; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP31:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
 ; DEFAULT:       exit:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -1804,7 +1551,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; PRED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; PRED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; PRED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; PRED:       middle.block:
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
@@ -1820,7 +1567,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; PRED-NEXT:    [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32
 ; PRED-NEXT:    store i32 [[T]], ptr [[DST]], align 4
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP28:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -1879,39 +1626,33 @@ attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
 ; DEFAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
 ; DEFAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]]}
 ; DEFAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]]}
-; DEFAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
 ;.
 ; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; PRED: [[META6]] = !{[[META7:![0-9]+]]}
-; PRED: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
-; PRED: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
+; PRED: [[META4]] = !{[[META5:![0-9]+]]}
+; PRED: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; PRED: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
+; PRED: [[META7]] = !{[[META8:![0-9]+]]}
+; PRED: [[META8]] = distinct !{[[META8]], [[META6]]}
 ; PRED: [[META9]] = !{[[META10:![0-9]+]]}
-; PRED: [[META10]] = distinct !{[[META10]], [[META8]]}
+; PRED: [[META10]] = distinct !{[[META10]], [[META6]]}
 ; PRED: [[META11]] = !{[[META12:![0-9]+]]}
-; PRED: [[META12]] = distinct !{[[META12]], [[META8]]}
-; PRED: [[META13]] = !{[[META14:![0-9]+]]}
-; PRED: [[META14]] = distinct !{[[META14]], [[META8]]}
-; PRED: [[META15]] = !{[[META16:![0-9]+]], [[META7]], [[META10]], [[META12]]}
-; PRED: [[META16]] = distinct !{[[META16]], [[META8]]}
-; PRED: [[META17]] = !{[[META16]]}
-; PRED: [[META18]] = !{[[META7]], [[META10]], [[META12]]}
+; PRED: [[META12]] = distinct !{[[META12]], [[META6]]}
+; PRED: [[META13]] = !{[[META14:![0-9]+]], [[META5]], [[META8]], [[META10]]}
+; PRED: [[META14]] = distinct !{[[META14]], [[META6]]}
+; PRED: [[META15]] = !{[[META14]]}
+; PRED: [[META16]] = !{[[META5]], [[META8]], [[META10]]}
+; PRED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
+; PRED: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]}
 ; PRED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-; PRED: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
+; PRED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
 ; PRED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
-; PRED: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
+; PRED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]}
 ; PRED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
 ; PRED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
-; PRED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
-; PRED: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]]}
-; PRED: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]], [[META2]]}
-; PRED: [[LOOP28]] = distinct !{[[LOOP28]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index edba5ee1d7f9eb..3c8571212026ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -308,71 +308,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[MUL_X:%.*]] = add i32 [[X]], 1
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; PRED:       vector.scevcheck:
-; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
-; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
-; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
-; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
-; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
-; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
-; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
-; PRED-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 2
-; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
-; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT:    [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64>
-; PRED-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0
-; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT:    store i32 1, ptr [[TMP20]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
-; PRED:       pred.store.if1:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1
-; PRED-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
-; PRED-NEXT:    store i32 1, ptr [[TMP23]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; PRED:       pred.store.continue2:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true>
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; PRED-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
-; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; PRED-NEXT:    br label [[FOR_BODY:%.*]]
 ; PRED:       for.body:
-; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; PRED-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
 ; PRED-NEXT:    [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
 ; PRED-NEXT:    [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
@@ -380,7 +318,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    store i32 1, ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[FOR_BODY]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -478,106 +416,20 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-LABEL: define void @trunc_ivs_and_store(
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; PRED-NEXT:  entry:
-; PRED-NEXT:    [[MUL:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; PRED:       vector.scevcheck:
 ; PRED-NEXT:    [[TMP1:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = sub i32 0, [[TMP1]]
-; PRED-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
-; PRED-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
-; PRED-NEXT:    [[TMP5:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; PRED-NEXT:    [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
-; PRED-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
-; PRED-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
-; PRED-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; PRED-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
-; PRED-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
-; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT:    [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
-; PRED-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
-; PRED-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
-; PRED-NEXT:    store i32 [[TMP22]], ptr [[TMP21]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; PRED:       pred.store.if3:
-; PRED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
-; PRED-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
-; PRED-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
-; PRED-NEXT:    store i32 [[TMP26]], ptr [[TMP25]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; PRED:       pred.store.continue4:
-; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
-; PRED:       pred.store.if5:
-; PRED-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
-; PRED-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
-; PRED-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT:    store i32 [[TMP30]], ptr [[TMP29]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.continue6:
-; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
-; PRED:       pred.store.if7:
-; PRED-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
-; PRED-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
-; PRED-NEXT:    [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT:    store i32 [[TMP34]], ptr [[TMP33]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; PRED:       pred.store.continue8:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT:    [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; PRED-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
-; PRED-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; PRED-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
-; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
-; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; PRED-NEXT:    [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
-; PRED-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; PRED-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]]
 ; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
 ; PRED-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
 ; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
 ; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
 ; PRED-NEXT:    [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; PRED-NEXT:    br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED-NEXT:    br i1 [[EXITCOND_3_NOT]], label [[EXIT:%.*]], label [[LOOP]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -677,95 +529,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    [[ADD:%.*]] = add i32 [[X]], 1
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; PRED:       vector.scevcheck:
-; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
-; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
-; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
-; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
-; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
-; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
-; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
-; PRED-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
-; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT:    [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64>
-; PRED-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0
-; PRED-NEXT:    store i32 [[TMP21]], ptr [[TMP20]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
-; PRED:       pred.store.if2:
-; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
-; PRED-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
-; PRED-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE3]]
-; PRED:       pred.store.continue3:
-; PRED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; PRED:       pred.store.if4:
-; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; PRED-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
-; PRED-NEXT:    [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT:    store i32 [[TMP29]], ptr [[TMP28]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE5]]
-; PRED:       pred.store.continue5:
-; PRED-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
-; PRED:       pred.store.if6:
-; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
-; PRED-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
-; PRED-NEXT:    [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
-; PRED:       pred.store.continue7:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
-; PRED-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; PRED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; PRED-NEXT:    br label [[LOOP:%.*]]
 ; PRED:       loop:
-; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
-; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
 ; PRED-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
 ; PRED-NEXT:    [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
 ; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
@@ -774,7 +541,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -854,67 +621,16 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-LABEL: define void @exit_cond_zext_iv(
 ; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
 ; PRED-NEXT:  entry:
-; PRED-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; PRED:       vector.scevcheck:
-; PRED-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
-; PRED-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
-; PRED-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
-; PRED-NEXT:    [[TMP3:%.*]] = add i32 1, [[TMP2]]
-; PRED-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
-; PRED-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
-; PRED-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; PRED-NEXT:    br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PRED:       vector.ph:
-; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
-; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
-; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
-; PRED-NEXT:    br label [[LOOP:%.*]]
-; PRED:       vector.body:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; PRED-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
-; PRED-NEXT:    [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
-; PRED-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; PRED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PRED:       pred.store.if:
-; PRED-NEXT:    [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT:    [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
-; PRED-NEXT:    store i32 0, ptr [[GEP]], align 8
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PRED:       pred.store.continue:
-; PRED-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; PRED-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.if5:
-; PRED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 1
-; PRED-NEXT:    [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
-; PRED-NEXT:    store i32 0, ptr [[TMP13]], align 8
-; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; PRED:       pred.store.continue6:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; PRED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; PRED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
-; PRED:       middle.block:
-; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; PRED-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; PRED-NEXT:    br label [[LOOP1:%.*]]
 ; PRED:       loop:
-; PRED-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
-; PRED-NEXT:    [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT:    [[IV_CONV1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
 ; PRED-NEXT:    [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
 ; PRED-NEXT:    store i32 0, ptr [[GEP1]], align 8
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
 ; PRED-NEXT:    [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
 ; PRED-NEXT:    [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
-; PRED-NEXT:    br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; PRED-NEXT:    br i1 [[C]], label [[LOOP1]], label [[EXIT:%.*]]
 ; PRED:       exit:
 ; PRED-NEXT:    ret void
 ;
@@ -956,12 +672,4 @@ attributes #0 = { "target-features"="+sve" }
 ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
-; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
-; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
-; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
index a0696b3204dbd4..cbbb9e7ed08473 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -21,7 +21,31 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT:   vector.body:
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.*]]>
 ; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:     EMIT vp<[[CMP:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
+; CHECK-NEXT:   Successor(s): pred.load
+; CHECK-EMPTY:
+; CHECK-NEXT:  <xVFxUF> pred.load: {
+; CHECK-NEXT:    pred.load.entry:
+; CHECK-NEXT:      BRANCH-ON-MASK vp<[[CMP]]>
+; CHECK-NEXT:    Successor(s): pred.load.if, pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.load.if:
+; CHECK-NEXT:      REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
+; CHECK-NEXT:      REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT:      REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
+; CHECK-NEXT:      REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT:    Successor(s): pred.load.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.load.continue:
+; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<[[LOAD0:%.+]]> = ir<%0>
+; CHECK-NEXT:      PHI-PREDICATED-INSTRUCTION vp<[[LOAD1:%.+]]> = ir<%1>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): for.body.1
+; CHECK-EMPTY:
+; CHECK-NEXT:   for.body.1:
+; CHECK-NEXT:     WIDEN ir<%add> = add nsw vp<[[LOAD1]]>, vp<[[LOAD0]]>
 ; CHECK-NEXT:   Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <xVFxUF> pred.store: {
@@ -30,13 +54,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT:    Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    pred.store.if:
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:      REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT:      REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT:      REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
-; CHECK-NEXT:      REPLICATE ir<%1> = load ir<%arrayidx2>
 ; CHECK-NEXT:      REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT:      REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
 ; CHECK-NEXT:      REPLICATE store ir<%add>, ir<%arrayidx4>
 ; CHECK-NEXT:    Successor(s): pred.store.continue
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index dc474fbf67ce8b..6df3ce4d898adb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -139,70 +139,107 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[N_RND_UP10:%.*]] = add nuw nsw i64 [[TMP17]], 3
 ; CHECK-NEXT:    [[N_VEC12:%.*]] = and i64 [[N_RND_UP10]], 8589934588
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_116:%.*]] = add nsw i64 [[TMP17]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY19:%.*]]
-; CHECK:       vector.body17:
-; CHECK-NEXT:    [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT22]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
-; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
-; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP27]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP27]]
-; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
-; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
-; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP35]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP35]]
-; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
-; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
-; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
+; CHECK-NEXT:    br label [[VECTOR_BODY17:%.*]]
+; CHECK:       vector.body17:
+; CHECK-NEXT:    [[INDEX18:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT37:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX18]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX18]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT20]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT22]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP21]], i64 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP24]], i64 0
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP27]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY17]] ], [ [[TMP25]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY17]] ], [ [[TMP28]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP21]], i64 1
+; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK:       pred.load.if23:
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP33]], i64 1
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP36]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
+; CHECK:       pred.load.continue24:
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP34]], [[PRED_LOAD_IF23]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP37]], [[PRED_LOAD_IF23]] ]
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i1> [[TMP21]], i64 2
+; CHECK-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK:       pred.load.if25:
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP42]], i64 2
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP19]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP43]]
-; CHECK-NEXT:    [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
-; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
-; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP45]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
+; CHECK:       pred.load.continue26:
+; CHECK-NEXT:    [[TMP47:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP43]], [[PRED_LOAD_IF25]] ]
+; CHECK-NEXT:    [[TMP48:%.*]] = phi <4 x i32> [ [[TMP39]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP46]], [[PRED_LOAD_IF25]] ]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i1> [[TMP21]], i64 3
+; CHECK-NEXT:    br i1 [[TMP49]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK:       pred.load.if27:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> [[TMP47]], i32 [[TMP51]], i64 3
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP54]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
+; CHECK:       pred.load.continue28:
+; CHECK-NEXT:    [[TMP56:%.*]] = phi <4 x i32> [ [[TMP47]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP52]], [[PRED_LOAD_IF27]] ]
+; CHECK-NEXT:    [[TMP57:%.*]] = phi <4 x i32> [ [[TMP48]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP55]], [[PRED_LOAD_IF27]] ]
+; CHECK-NEXT:    [[TMP58:%.*]] = and <4 x i32> [[TMP57]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <4 x i1> [[TMP21]], i64 0
+; CHECK-NEXT:    br i1 [[TMP59]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK:       pred.store.if29:
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[TMP58]], i64 0
+; CHECK-NEXT:    store i32 [[TMP61]], ptr [[TMP60]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[INDEX_NEXT31]] = add i64 [[INDEX20]], 4
-; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC12]]
-; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY19]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP21]], i64 1
+; CHECK-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; CHECK:       pred.store.if31:
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i32> [[TMP58]], i64 1
+; CHECK-NEXT:    store i32 [[TMP64]], ptr [[TMP63]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32]]
+; CHECK:       pred.store.continue32:
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i1> [[TMP21]], i64 2
+; CHECK-NEXT:    br i1 [[TMP65]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; CHECK:       pred.store.if33:
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i32> [[TMP58]], i64 2
+; CHECK-NEXT:    store i32 [[TMP67]], ptr [[TMP66]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE34]]
+; CHECK:       pred.store.continue34:
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i1> [[TMP21]], i64 3
+; CHECK-NEXT:    br i1 [[TMP68]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]]
+; CHECK:       pred.store.if35:
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <4 x i32> [[TMP58]], i64 3
+; CHECK-NEXT:    store i32 [[TMP70]], ptr [[TMP69]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE36]]
+; CHECK:       pred.store.continue36:
+; CHECK-NEXT:    [[INDEX_NEXT37]] = add i64 [[INDEX18]], 4
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT37]], [[N_VEC12]]
+; CHECK-NEXT:    br i1 [[TMP71]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY17]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block7:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]]
 ; CHECK:       scalar.ph8:
@@ -269,17 +306,17 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[TMP2]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE20:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX8:%.*]] = shl i64 [[INDEX]], 2
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT13]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT14]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT14]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
@@ -451,7 +488,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE23:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[OFFSET_IDX7:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -459,65 +496,91 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i64> [[VEC_IV]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
-; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX7]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[NEXT_GEP]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7
-; CHECK-NEXT:    store i32 [[TMP5]], ptr [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
+; CHECK:       pred.load.if12:
+; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[NEXT_GEP4]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[TMP8]], i64 1
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
+; CHECK:       pred.load.continue13:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i16> [ [[TMP5]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP9]], [[PRED_LOAD_IF12]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
+; CHECK:       pred.load.if14:
+; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[NEXT_GEP5]], align 2
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[TMP13]], i64 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
+; CHECK:       pred.load.continue15:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i16> [ [[TMP10]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP14]], [[PRED_LOAD_IF14]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
+; CHECK-NEXT:    br i1 [[TMP16]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
+; CHECK:       pred.load.if16:
+; CHECK-NEXT:    [[TMP17:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[NEXT_GEP6]], align 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP18]], i64 3
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
+; CHECK:       pred.load.continue17:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i16> [ [[TMP15]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP19]], [[PRED_LOAD_IF16]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <4 x i16> [[TMP20]] to <4 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP21]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
+; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX7]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP22]], i64 0
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[NEXT_GEP8]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
-; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 4
-; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[NEXT_GEP4]], align 2
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7
-; CHECK-NEXT:    store i32 [[TMP11]], ptr [[NEXT_GEP9]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
-; CHECK:       pred.store.if14:
-; CHECK-NEXT:    [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 8
-; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[NEXT_GEP5]], align 2
-; CHECK-NEXT:    [[TMP16:%.*]] = zext i16 [[TMP15]] to i32
-; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7
-; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP10]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]]
-; CHECK:       pred.store.if16:
-; CHECK-NEXT:    [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 12
-; CHECK-NEXT:    [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP6]], align 2
-; CHECK-NEXT:    [[TMP22:%.*]] = zext i16 [[TMP21]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
-; CHECK-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP11]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE17]]
-; CHECK:       pred.store.continue17:
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
+; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; CHECK:       pred.store.if18:
+; CHECK-NEXT:    [[TMP26:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 4
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP22]], i64 1
+; CHECK-NEXT:    store i32 [[TMP27]], ptr [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE19]]
+; CHECK:       pred.store.continue19:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; CHECK:       pred.store.if20:
+; CHECK-NEXT:    [[TMP29:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 8
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP22]], i64 2
+; CHECK-NEXT:    store i32 [[TMP30]], ptr [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE21]]
+; CHECK:       pred.store.continue21:
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
+; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23]]
+; CHECK:       pred.store.if22:
+; CHECK-NEXT:    [[TMP32:%.*]] = or disjoint i64 [[OFFSET_IDX7]], 12
+; CHECK-NEXT:    [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[TMP22]], i64 3
+; CHECK-NEXT:    store i32 [[TMP33]], ptr [[NEXT_GEP11]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE23]]
+; CHECK:       pred.store.continue23:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[TMP26:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[TMP36:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    br label [[TMP25:%.*]]
-; CHECK:       25:
-; CHECK-NEXT:    br i1 poison, label [[TMP26]], label [[TMP25]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       26:
+; CHECK-NEXT:    br label [[TMP35:%.*]]
+; CHECK:       35:
+; CHECK-NEXT:    br i1 poison, label [[TMP36]], label [[TMP35]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       36:
 ; CHECK-NEXT:    ret void
 ;
   br label %1

>From b9f8c9cf5b7133a2b1dfa8ea2f8f100ec3022eb8 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Wed, 2 Oct 2024 16:29:40 +0100
Subject: [PATCH 2/2] Add cost model test

---
 .../CostModel/AArch64/tail-folding.ll         | 281 ++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/tail-folding.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll b/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll
new file mode 100644
index 00000000000000..ed84b73bd21284
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/tail-folding.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "fold tail" --filter "estimated cost" --filter "costs" --filter "Selecting VF" --filter "loop costs" --version 5
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; These tests check that if the only way to vectorize is to tail fold a store by
+; masking then we properly account for the cost of creating a predicated block
+; for each vector element.
+
+define void @store_const_fixed_trip_count(ptr %dst) {
+; CHECK-LABEL: 'store_const_fixed_trip_count'
+; CHECK:  LV: can fold tail by masking.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 2 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 16 for VF 4 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 4 costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 32 for VF 8 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 8 costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 64 for VF 16 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 16 costs: 4.
+; CHECK:  LV: Selecting VF: 1.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 1, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 7
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @store_trunc_iv_fixed_trip_count(ptr %dst) {
+; CHECK-LABEL: 'store_trunc_iv_fixed_trip_count'
+; CHECK:  LV: can fold tail by masking.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 2 costs: 6.
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 4 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 8 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 16 costs: 5.
+; CHECK:  LV: Selecting VF: 1.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i8
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %iv.trunc, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 7
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; With an unknown trip count we need to use optsize, as otherwise a scalar
+; epilogue is used instead of tail folding.
+define void @store_const_unknown_trip_count(ptr %dst, i64 %limit) optsize {
+; CHECK-LABEL: 'store_const_unknown_trip_count'
+; CHECK:  LV: can fold tail by masking.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 2 costs: 6.
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 4 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 8 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 16 costs: 5.
+; CHECK:  LV: Selecting VF: 1.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i8
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %iv.trunc, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %limit
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @store_trunc_iv_unknown_trip_count(ptr %dst, i64 %limit) optsize {
+; CHECK-LABEL: 'store_trunc_iv_unknown_trip_count'
+; CHECK:  LV: can fold tail by masking.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 2 costs: 6.
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 20 for VF 4 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 4 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 40 for VF 8 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 4 for VF 8 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 8 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 8 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %iv.trunc = trunc i64 %iv to i8
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 80 for VF 16 For instruction: store i8 %iv.trunc, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 8 for VF 16 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 16 For instruction: %ec = icmp eq i64 %iv.next, %limit
+; CHECK:  LV: Found an estimated cost of 0 for VF 16 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 16 costs: 5.
+; CHECK:  LV: Selecting VF: 1.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i8
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %iv.trunc, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %limit
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}