[llvm] [LV][VPlan] Implement VPlan-based cost for exit condition. (PR #125640)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 9 21:41:31 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/125640
From c45d2307598af8763e466cd1e927993221ab9b07 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 9 Mar 2025 19:43:22 -0700
Subject: [PATCH 1/4] [LV][VPlan] Implement VPlan-based cost for exit
condition.
This patch models the cost of exit conditions through the
VPlan-based cost model.
* `BranchOnCount` will generate icmp + br.
The branch instruction is already modeled by the VPRegionBlock, so
we only need to compute the cost of the icmp.
If the VF is the same as the trip count of the loop, the
BranchOnCount is free.
This patch is not quite NFC, for the following reasons:
* Some BranchOnCount instructions can be optimized to BranchOnCond,
which is free.
* Some of the instructions feeding the exit condition, which the legacy
cost model counted, will be optimized out.
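
For illustration, the new costing rule amounts to the following
standalone sketch (hypothetical helper and parameter names; the actual
change lives in VPInstruction::computeCost below):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"
  #include <optional>

  using namespace llvm;

  // Sketch: cost a BranchOnCount terminator. The br itself is modeled by
  // the enclosing VPRegionBlock, so only the feeding icmp is costed.
  // `TripCount` and `FixedVF` are assumed to be known by the caller.
  static InstructionCost
  costBranchOnCount(const TargetTransformInfo &TTI, Type *CountTy,
                    std::optional<uint64_t> TripCount, unsigned FixedVF,
                    TargetTransformInfo::TargetCostKind CostKind) {
    // If the vector loop runs exactly once (trip count == VF), the
    // compare folds away and the exit condition is free.
    if (TripCount && *TripCount == FixedVF)
      return 0;
    // Otherwise the exit costs one scalar integer compare.
    return TTI.getCmpSelInstrCost(Instruction::ICmp, CountTy,
                                  /*CondTy=*/nullptr, CmpInst::ICMP_EQ,
                                  CostKind);
  }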
---
.../Transforms/Vectorize/LoopVectorize.cpp | 45 +-----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 ++
.../AArch64/clamped-trip-count.ll | 80 ++++++-----
.../AArch64/conditional-branches-cost.ll | 54 ++++++--
.../AArch64/fully-unrolled-cost.ll | 12 +-
.../AArch64/induction-costs-sve.ll | 128 ++++++-----------
.../AArch64/partial-reduce-dot-product.ll | 32 ++---
.../AArch64/uniform-args-call-variants.ll | 131 ++++++++++++------
.../LoopVectorize/ARM/mve-icmpcost.ll | 21 +--
.../LoopVectorize/RISCV/low-trip-count.ll | 22 +--
.../LoopVectorize/RISCV/short-trip-count.ll | 34 ++---
.../X86/CostModel/vpinstruction-cost.ll | 7 +-
.../LoopVectorize/X86/cost-model.ll | 70 +++++-----
.../LoopVectorize/X86/multi-exit-cost.ll | 21 ++-
.../LoopVectorize/X86/reduction-small-size.ll | 4 +-
15 files changed, 338 insertions(+), 336 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb860a472d8f7..03f76148db515 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7290,46 +7290,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
}
}
- /// Compute the cost of all exiting conditions of the loop using the legacy
- /// cost model. This is to match the legacy behavior, which adds the cost of
- /// all exit conditions. Note that this over-estimates the cost, as there will
- /// be a single condition to control the vector loop.
- SmallVector<BasicBlock *> Exiting;
- CM.TheLoop->getExitingBlocks(Exiting);
- SetVector<Instruction *> ExitInstrs;
- // Collect all exit conditions.
- for (BasicBlock *EB : Exiting) {
- auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
- if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
- continue;
- if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
- ExitInstrs.insert(CondI);
- }
- }
- // Compute the cost of all instructions only feeding the exit conditions.
- for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
- Instruction *CondI = ExitInstrs[I];
- if (!OrigLoop->contains(CondI) ||
- !CostCtx.SkipCostComputation.insert(CondI).second)
- continue;
- InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
- LLVM_DEBUG({
- dbgs() << "Cost of " << CondICost << " for VF " << VF
- << ": exit condition instruction " << *CondI << "\n";
- });
- Cost += CondICost;
- for (Value *Op : CondI->operands()) {
- auto *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
- any_of(OpI->users(), [&ExitInstrs, this](User *U) {
- return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
- !ExitInstrs.contains(cast<Instruction>(U));
- }))
- continue;
- ExitInstrs.insert(OpI);
- }
- }
-
// The legacy cost model has special logic to compute the cost of in-loop
// reductions, which may be smaller than the sum of all instructions involved
// in the reduction.
@@ -7486,6 +7446,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
return true;
+
+ // The legacy cost model underestimates the cost of BranchOnCount when
+ // the exit condition is explicitly constructed in the VPlan.
+ if (auto *VPI = dyn_cast<VPInstruction>(&R);
+     VPI && VPI->getOpcode() == VPInstruction::BranchOnCount)
+   return true;
+
if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d154d54c37862..5516d6209a0ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -743,6 +743,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
+ case VPInstruction::BranchOnCount: {
+   // BranchOnCount generates icmp_eq + br instructions. The cost of the
+   // branch itself is already accounted for by the VPRegionBlock, so only
+   // the compare is costed here. If the vector loop executes exactly once
+   // (trip count == VF), the compare folds away and is free.
+   Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+   auto *TC = dyn_cast_if_present<ConstantInt>(
+       getParent()->getPlan()->getTripCount()->getUnderlyingValue());
+   if (TC && VF.isFixed() && TC->getSExtValue() == VF.getFixedValue())
+     return 0;
+   return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
+                                     CmpInst::ICMP_EQ, Ctx.CostKind);
+ }
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 3f2aef84a6a4c..65a38638aa37c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -8,39 +8,41 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
-; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 3)
+; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 2 x i64> [[TMP11]] to <vscale x 2 x i8>
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 8)
+; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 0
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -94,39 +96,41 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
-; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 3)
+; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 2 x i64> [[TMP11]] to <vscale x 2 x i8>
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 0
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index cf4fc143fe8c3..5fd68ce3b735c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -768,9 +768,20 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-LABEL: define void @multiple_exit_conditions(
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
; DEFAULT-NEXT: entry:
-; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], 8
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP9]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DEFAULT: vector.ph:
-; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
+; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP10]], 8
+; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
+; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
; DEFAULT: vector.body:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -778,20 +789,39 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
-; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP14:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 2 x i16> [[TMP11]] to <vscale x 2 x double>
+; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 2 x i16> [[TMP12]] to <vscale x 2 x double>
+; DEFAULT-NEXT: [[TMP17:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
+; DEFAULT-NEXT: [[TMP18:%.*]] = uitofp <vscale x 2 x i16> [[TMP14]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP21]]
+; DEFAULT-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4
+; DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP24]]
+; DEFAULT-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 6
+; DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP27]]
+; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP15]], ptr [[TMP4]], align 8
+; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP16]], ptr [[TMP22]], align 8
+; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP25]], align 8
+; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP18]], ptr [[TMP28]], align 8
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DEFAULT-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT: middle.block:
-; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT: scalar.ph:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
-; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; DEFAULT-NEXT: br label [[LOOP:%.*]]
; DEFAULT: vector.scevcheck:
; DEFAULT-NEXT: unreachable
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 92857197a7281..af57c5172fbd1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
entry:
@@ -43,7 +44,6 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
@@ -80,12 +80,13 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 27
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
@@ -116,11 +117,12 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user'
; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 12
+; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
+; CHECK: Cost for VF 8: 13
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 4
; CHECK: LV: Selecting VF: 16
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 62186d5a20989..cbd6a40487704 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -450,66 +450,48 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
; PRED-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
-; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 2
; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: [[TMP18:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP23:%.*]] = zext <2 x i32> [[TMP18]] to <2 x i64>
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
-; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; PRED-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
; PRED: pred.store.continue:
-; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; PRED-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE7]]
; PRED: pred.store.if2:
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE3]]
-; PRED: pred.store.continue3:
-; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; PRED: pred.store.if4:
-; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
-; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
-; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE5]]
-; PRED: pred.store.continue5:
-; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
-; PRED: pred.store.if6:
-; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
-; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
-; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
-; PRED: pred.store.continue7:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
-; PRED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED: pred.store.continue3:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP16]])
+; PRED-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
+; PRED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
@@ -647,66 +629,48 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
; PRED-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
-; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
-; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[ADD]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64>
-; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: [[TMP17:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP22:%.*]] = zext <2 x i32> [[TMP17]] to <2 x i64>
+; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP22]], i32 0
; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0
; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
; PRED: pred.store.continue:
-; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; PRED-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.if1:
-; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; PRED-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP22]], i32 1
; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; PRED: pred.store.continue2:
-; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; PRED: pred.store.if3:
-; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
-; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; PRED: pred.store.continue4:
-; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; PRED: pred.store.if5:
-; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
-; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
-; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; PRED: pred.store.continue6:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
-; PRED-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; PRED: pred.store.continue2:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP27:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; PRED-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP27]], i32 0
+; PRED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index a734a0e94198a..1fc87451cb060 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2238,41 +2238,41 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement <vscale x 16 x i1> [[TMP19]], i32 0
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index ce8492cd77362..dad87826020fa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -183,55 +183,106 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
; CHECK-LABEL: define void @test_uniform_not_invariant
; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
-; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_CALL_CONTINUE2:%.*]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_CALL_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[TMP1]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK]], <2 x double> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i64 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_CALL_IF:%.*]], label [[PRED_CALL_CONTINUE:%.*]]
+; CHECK: pred.call.if:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call double @foo(double [[TMP3]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT: br label [[PRED_CALL_CONTINUE]]
+; CHECK: pred.call.continue:
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_CALL_IF]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i64 1
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_CALL_IF1:%.*]], label [[PRED_CALL_CONTINUE2]]
+; CHECK: pred.call.if1:
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = call double @foo(double [[TMP9]], i64 [[TMP8]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP10]], i64 1
+; CHECK-NEXT: br label [[PRED_CALL_CONTINUE2]]
+; CHECK: pred.call.continue2:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x double> [ [[TMP6]], [[PRED_CALL_CONTINUE]] ], [ [[TMP11]], [[PRED_CALL_IF1]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[TMP12]], ptr [[TMP13]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
; INTERLEAVE-LABEL: define void @test_uniform_not_invariant
; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; INTERLEAVE-NEXT: entry:
-; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ne i64 [[N]], 0
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ugt i64 [[N]], 1
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 4)
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 2, i64 [[N]])
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ true, [[PRED_STORE_CONTINUE4]] ]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; INTERLEAVE: pred.store.if:
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_CALL_CONTINUE9:%.*]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_CALL_CONTINUE9]] ]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_CALL_CONTINUE9]] ]
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]]
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store double [[TMP3]], ptr [[TMP4]], align 8
-; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]]
-; INTERLEAVE: pred.store.continue:
-; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
-; INTERLEAVE: pred.store.if3:
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDEX]], 1
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP5]]
-; INTERLEAVE-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP6]], align 8
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = call double @foo(double [[TMP7]], i64 [[TMP5]]) #[[ATTR5]]
-; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP5]]
-; INTERLEAVE-NEXT: store double [[TMP8]], ptr [[TMP9]], align 8
-; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; INTERLEAVE: pred.store.continue4:
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP10]], [[TMP0]]
-; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[TMP1]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK]], <2 x double> poison)
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[TMP2]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x double> poison)
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP3]], label [[PRED_CALL_IF:%.*]], label [[PRED_CALL_CONTINUE:%.*]]
+; INTERLEAVE: pred.call.if:
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD]], i64 0
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]]
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
+; INTERLEAVE-NEXT: br label [[PRED_CALL_CONTINUE]]
+; INTERLEAVE: pred.call.continue:
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = phi <2 x double> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_CALL_IF]] ]
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i64 1
+; INTERLEAVE-NEXT: br i1 [[TMP8]], label [[PRED_CALL_IF4:%.*]], label [[PRED_CALL_CONTINUE5:%.*]]
+; INTERLEAVE: pred.call.if4:
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD]], i64 1
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = call double @foo(double [[TMP10]], i64 [[TMP9]]) #[[ATTR5]]
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP11]], i64 1
+; INTERLEAVE-NEXT: br label [[PRED_CALL_CONTINUE5]]
+; INTERLEAVE: pred.call.continue5:
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP7]], [[PRED_CALL_CONTINUE]] ], [ [[TMP12]], [[PRED_CALL_IF4]] ]
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[PRED_CALL_IF6:%.*]], label [[PRED_CALL_CONTINUE7:%.*]]
+; INTERLEAVE: pred.call.if6:
+; INTERLEAVE-NEXT: [[TMP15:%.*]] = or disjoint i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD3]], i64 0
+; INTERLEAVE-NEXT: [[TMP17:%.*]] = call double @foo(double [[TMP16]], i64 [[TMP15]]) #[[ATTR5]]
+; INTERLEAVE-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TMP17]], i64 0
+; INTERLEAVE-NEXT: br label [[PRED_CALL_CONTINUE7]]
+; INTERLEAVE: pred.call.continue7:
+; INTERLEAVE-NEXT: [[TMP19:%.*]] = phi <2 x double> [ poison, [[PRED_CALL_CONTINUE5]] ], [ [[TMP18]], [[PRED_CALL_IF6]] ]
+; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i64 1
+; INTERLEAVE-NEXT: br i1 [[TMP20]], label [[PRED_CALL_IF8:%.*]], label [[PRED_CALL_CONTINUE9]]
+; INTERLEAVE: pred.call.if8:
+; INTERLEAVE-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 3
+; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[WIDE_MASKED_LOAD3]], i64 1
+; INTERLEAVE-NEXT: [[TMP23:%.*]] = call double @foo(double [[TMP22]], i64 [[TMP21]]) #[[ATTR5]]
+; INTERLEAVE-NEXT: [[TMP24:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP23]], i64 1
+; INTERLEAVE-NEXT: br label [[PRED_CALL_CONTINUE9]]
+; INTERLEAVE: pred.call.continue9:
+; INTERLEAVE-NEXT: [[TMP25:%.*]] = phi <2 x double> [ [[TMP19]], [[PRED_CALL_CONTINUE7]] ], [ [[TMP24]], [[PRED_CALL_IF8]] ]
+; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP26]], i64 16
+; INTERLEAVE-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[TMP13]], ptr [[TMP26]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[TMP25]], ptr [[TMP27]], i32 8, <2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; INTERLEAVE-NEXT: [[TMP28:%.*]] = or disjoint i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP28]], i64 [[TMP0]])
+; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP29]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; INTERLEAVE: for.cond.cleanup:
; INTERLEAVE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index b22910316d7cb..4ace22d6dd1ab 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -21,7 +21,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: LV: Scalar loop costs: 5.
; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
@@ -34,11 +33,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx7>
; CHECK: Cost of 16 for VF 2: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43.
; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
@@ -51,11 +49,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%arrayidx7>
; CHECK: Cost of 2 for VF 4: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
-; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2.
; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1
; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
@@ -68,7 +65,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
; CHECK: Cost of 0 for VF 8: vp<%5> = vector-pointer ir<%arrayidx7>
; CHECK: Cost of 2 for VF 8: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
-; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5.
; CHECK: LV: Selecting VF: 4.
define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
@@ -134,7 +131,6 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 2: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
@@ -156,7 +152,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65.
; CHECK: Cost of 1 for VF 4: induction instruction %dec = add i32 %blkCnt.012, -1
; CHECK: Cost of 0 for VF 4: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
@@ -166,7 +162,6 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 4: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
@@ -188,7 +183,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
-; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3.
; CHECK: Cost of 1 for VF 8: induction instruction %dec = add i32 %blkCnt.012, -1
; CHECK: Cost of 0 for VF 8: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
@@ -198,7 +193,6 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 8: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
@@ -220,7 +214,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}
-; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3.
; CHECK: Cost of 1 for VF 16: induction instruction %dec = add i32 %blkCnt.012, -1
; CHECK: Cost of 0 for VF 16: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
@@ -230,7 +224,6 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 16: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
@@ -252,7 +245,7 @@ for.inc: ; preds = %for.body, %if.then
; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
-; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 16: 50
; CHECK: LV: Selecting VF: 16.
define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 10ac870c112ae..346a27b608ccc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -181,32 +181,24 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 8)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP9]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shl <8 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP12]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i8> [[TMP2]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP5]], ptr [[TMP14]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 3386a7d3972aa..556f35465a7a1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -6,26 +6,20 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP1]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -61,28 +55,20 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
index bb85b88f181f7..ec01bd068f9fc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll
@@ -9,7 +9,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction'
; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
@@ -23,11 +22,10 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst>
; CHECK: Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c>
; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2>
; CHECK: Cost of 0 for VF 2: vector loop backedge
; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
@@ -41,11 +39,10 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst>
; CHECK: Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c>
; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
-; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2>
; CHECK: Cost of 0 for VF 4: vector loop backedge
; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
;
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index b11f72c886bdc..8362cec30109e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -335,7 +335,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 14
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1)
@@ -1224,49 +1224,49 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295
; CHECK-NEXT: br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 32
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 32
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[STEP_ADD_2]] to <4 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[STEP_ADD_3]] to <4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP15]] = or <4 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP16]] = or <4 x i32> [[TMP12]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP17]] = or <4 x i32> [[TMP13]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP18]] = or <4 x i32> [[TMP14]], [[VEC_PHI4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <8 x i32> [[STEP_ADD]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <8 x i32> [[STEP_ADD_2]], splat (i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[VEC_IND]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[STEP_ADD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[STEP_ADD_2]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i32> [[STEP_ADD_3]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP15]] = or <8 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP16]] = or <8 x i32> [[TMP12]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP17]] = or <8 x i32> [[TMP13]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP18]] = or <8 x i32> [[TMP14]], [[VEC_PHI4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[STEP_ADD_3]], splat (i32 8)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <4 x i32> [[TMP18]], [[BIN_RDX5]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX6]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <8 x i32> [[TMP17]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <8 x i32> [[TMP18]], [[BIN_RDX5]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX6]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index 99d39f3e88983..4c9412361ddd4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -27,24 +27,31 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -7
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -3
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP26]], i32 -4
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP17]] = xor <4 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP19]] = xor <4 x i8> [[REVERSE5]], [[VEC_PHI3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP29]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i8> [[TMP19]], [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> [[BIN_RDX]])
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 28f8988bd853a..13c692dba544f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -30,8 +30,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i32 %lftr.wideiv, %n
-; CHECK: Cost of 0 for VF 2: exit condition instruction %lftr.wideiv = trunc i64 %indvars.iv.next to i32
; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi ir<0>, vp<%8>
; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
@@ -49,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc ir<%add5> to i8
; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext vp<%7> to i32
; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
;
define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
entry:
>From 8d0c466caa3818f05637064f29e3c29114795c15 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 22 Jan 2025 15:39:51 -0800
Subject: [PATCH 2/4] [LV][VPlan] Implement VPlan-based cost for exit
condition.
This patch models the cost of exit conditions through the
VPlan-based cost model.
* `BranchOnCount` will generate icmp + br.
The branch instruction is already accounted for by the VPRegionBlock, so
we only need to calculate the cost of the icmp.
If the VF is the same as the trip count of the loop, the
BranchOnCount is free.
This patch is not quite NFC for the following reasons.
* Some of the BranchOnCounts could be optimized to BranchOnCond, which
is free.
* Some of the instructions counted for the exit condition by the legacy
cost model will be optimized out.
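Condensed into one place, the rule above amounts to the following sketch
(reusing the Ctx/TTI/VF names from the hunks below; illustrative only, not
a standalone API):

  // Editorial sketch (not part of the patch): the BranchOnCount costing
  // rule described above, condensed from the VPlanRecipes.cpp hunks below.
  static InstructionCost costBranchOnCount(VPInstruction *I, ElementCount VF,
                                           VPCostContext &Ctx) {
    Type *ValTy = Ctx.Types.inferScalarType(I->getOperand(0));
    // The vector loop body runs exactly once when the constant trip count
    // equals the fixed VF, so the exit compare folds away and costs nothing.
    auto *TC = dyn_cast_if_present<ConstantInt>(
        I->getParent()->getPlan()->getTripCount()->getUnderlyingValue());
    if (TC && VF.isFixed() && TC->getSExtValue() == VF.getFixedValue())
      return 0;
    // Otherwise BranchOnCount lowers to icmp eq + br; the br is already
    // costed by the enclosing VPRegionBlock, so only the icmp is charged.
    return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
                                      CmpInst::ICMP_EQ, Ctx.CostKind);
  }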
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++--
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 03f76148db515..b1b3b65b8141a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7447,8 +7447,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (isa<VPPartialReductionRecipe>(&R))
return true;
- // The legacy cost model will under estimate the cost of BranchOnCount if exit condition were explicit contructed in the vplan.
- if (VPInstruction *VPI = dyn_cast<VPInstruction>(&R); VPI && VPI->getOpcode() == VPInstruction::BranchOnCount)
+  // The legacy cost model will underestimate the cost of BranchOnCount if
+  // the exit condition was explicitly constructed in the VPlan.
+ if (VPInstruction *VPI = dyn_cast<VPInstruction>(&R);
+ VPI && VPI->getOpcode() == VPInstruction::BranchOnCount)
return true;
if (Instruction *UI = GetInstructionForCost(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5516d6209a0ac..ae40147de2f60 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -744,7 +744,6 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::BranchOnCount: {
- if (getUnderlyingValue())
// BranchOnCount will genearte icmp_eq + br instructions and the
// cost of branch will be calculated in VPRegionBlock.
// If the vector loop only executed once, ignore the cost of the cmp.
>From feff37e3b752b2ecb12761c60ab0bc7e41dcdfa3 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 5 Feb 2025 00:59:04 -0800
Subject: [PATCH 3/4] !fixup typo and implement cost of BranchOnCond
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 +++++++++++++++++--
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ae40147de2f60..86705e2e6b549 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -744,17 +744,39 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::BranchOnCount: {
- // BranchOnCount will genearte icmp_eq + br instructions and the
- // cost of branch will be calculated in VPRegionBlock.
- // If the vector loop only executed once, ignore the cost of the cmp.
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+
+ // If the vector loop only executed once, ignore the cost.
+ // TODO: We can remove this after hoist `unrollByUF` and
+ // `optimizeForVFandUF` which will should optimize BranchOnCount out.
auto TC = dyn_cast_if_present<ConstantInt>(
getParent()->getPlan()->getTripCount()->getUnderlyingValue());
if (TC && VF.isFixed() && TC->getSExtValue() == VF.getFixedValue())
return 0;
+
+  // BranchOnCount will generate icmp_eq + br instructions and the
+  // cost of the branch will be calculated in VPRegionBlock.
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
CmpInst::ICMP_EQ, Ctx.CostKind);
}
+ case VPInstruction::BranchOnCond: {
+ // BranchOnCond will generate `extractelement` when the condition is vector
+ // type.
+ VPValue *Op = getOperand(0);
+ VPRecipeBase *R = Op->getDefiningRecipe();
+ if (R &&
+ any_of(R->operands(), [&](VPValue *V) { return !R->usesScalars(V); }) &&
+ VF.isVector())
+ return Ctx.TTI.getVectorInstrCost(
+ Instruction::ExtractElement,
+ cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
+ Ctx.CostKind, 0, nullptr, nullptr);
+
+ // Otherwise, BranchOnCond is free since the branch cost is already
+ // calculated by VPBB.
+ return 0;
+ }
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
>From 915c3d49d58c6e70e142b7acf0fcd36f337a299e Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 9 Mar 2025 21:30:50 -0700
Subject: [PATCH 4/4] Update comments and check for a scalar condition.
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 32 +++++++++----------
1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 86705e2e6b549..84df7e535f4b6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -746,9 +746,10 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
case VPInstruction::BranchOnCount: {
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
- // If the vector loop only executed once, ignore the cost.
+  // If the vector loop only executes once (VF == original trip count), ignore
+  // the cost of the cmp.
// TODO: We can remove this after hoist `unrollByUF` and
- // `optimizeForVFandUF` which will should optimize BranchOnCount out.
+ // `optimizeForVFandUF` which will optimize BranchOnCount out.
auto TC = dyn_cast_if_present<ConstantInt>(
getParent()->getPlan()->getTripCount()->getUnderlyingValue());
if (TC && VF.isFixed() && TC->getSExtValue() == VF.getFixedValue())
@@ -760,22 +761,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
CmpInst::ICMP_EQ, Ctx.CostKind);
}
case VPInstruction::BranchOnCond: {
- // BranchOnCond will generate `extractelement` when the condition is vector
- // type.
- VPValue *Op = getOperand(0);
- VPRecipeBase *R = Op->getDefiningRecipe();
- if (R &&
- any_of(R->operands(), [&](VPValue *V) { return !R->usesScalars(V); }) &&
- VF.isVector())
- return Ctx.TTI.getVectorInstrCost(
- Instruction::ExtractElement,
- cast<VectorType>(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
- Ctx.CostKind, 0, nullptr, nullptr);
-
- // Otherwise, BranchOnCond is free since the branch cost is already
+
+ // BranchOnCond is free since the branch cost is already
// calculated by VPBB.
- return 0;
+ if (vputils::onlyFirstLaneUsed(getOperand(0)))
+ return 0;
+
+  // Otherwise, BranchOnCond will generate an `extractelement` to extract the
+  // condition from the vector.
+ return Ctx.TTI.getVectorInstrCost(
+ Instruction::ExtractElement,
+ cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
+ Ctx.CostKind, 0, nullptr, nullptr);
}
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
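For reference, the BranchOnCond costing after this final revision condenses
to the sketch below (helper names as in the hunk above; illustrative, not a
verbatim excerpt):

  // Editorial sketch (not part of the patch): the final BranchOnCond rule,
  // condensed from the hunk above.
  static InstructionCost costBranchOnCond(VPInstruction *I, ElementCount VF,
                                          VPCostContext &Ctx) {
    // A condition of which only the first lane is used is effectively
    // scalar; the branch itself is already costed by the VPBasicBlock.
    if (vputils::onlyFirstLaneUsed(I->getOperand(0)))
      return 0;
    // A genuinely vector condition needs an extractelement to feed the
    // scalar branch, so charge that.
    return Ctx.TTI.getVectorInstrCost(
        Instruction::ExtractElement,
        cast<VectorType>(
            toVectorTy(Ctx.Types.inferScalarType(I->getOperand(0)), VF)),
        Ctx.CostKind, 0, nullptr, nullptr);
  }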