[llvm] 97687b7 - [VPlan] Add active-lane-mask as VPlan-to-VPlan transformation.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 25 05:35:02 PDT 2023
Author: Florian Hahn
Date: 2023-09-25T13:34:45+01:00
New Revision: 97687b7aea17cfc6e9c42d89b6f22ac5b9c7ddc1
URL: https://github.com/llvm/llvm-project/commit/97687b7aea17cfc6e9c42d89b6f22ac5b9c7ddc1
DIFF: https://github.com/llvm/llvm-project/commit/97687b7aea17cfc6e9c42d89b6f22ac5b9c7ddc1.diff
LOG: [VPlan] Add active-lane-mask as VPlan-to-VPlan transformation.
This patch updates the mask creation code to always create compares of
the form (ICMP_ULE, wide canonical IV, backedge-taken-count) up front
when tail folding, and introduces active-lane-mask recipes as a later
transformation.
This effectively makes (ICMP_ULE, wide canonical IV, backedge-taken-count)
the canonical form for tail-folding early on. Introducing more specific
active-lane-mask recipes is treated as a VPlan-to-VPlan optimization.
This has the advantage of keeping the logic (and complexity) of
introducing active-lane-mask recipes in a single place, instead of
spreading the logic out across multiple functions. It also simplifies
initial VPlan construction and enables treating the introduction of EVL
as a similar optimization.
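As a rough illustration (a sketch only, using names assumed from the
hunks below rather than the verbatim diff), mask creation now always
emits the canonical compare, and the planner rewrites it later via the
new transform:

  // During initial VPlan construction, the header mask is always the
  // canonical compare against the backedge-taken count.
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  VPValue *HeaderMask =
      Builder.createICmp(CmpInst::ICMP_ULE, WideCanonicalIV, BTC);

  // Later, as a VPlan-to-VPlan transformation, the compare is replaced
  // by active-lane-mask recipes if the tail-folding style requests them.
  if (useActiveLaneMask(Style)) {
    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
    bool WithoutRuntimeCheck =
        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
    VPlanTransforms::addActiveLaneMask(Plan, ForControlFlow,
                                       WithoutRuntimeCheck);
  }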
Reviewed By: Ayal
Differential Revision: https://reviews.llvm.org/D158779
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 6ef348d33567551..9691e1cd4f2ed00 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -66,6 +66,7 @@ class VPBuilder {
public:
VPBuilder() = default;
+ VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
/// Clear the insertion point: created instructions will not be inserted into
/// a block.
@@ -143,6 +144,13 @@ class VPBuilder {
return createInstruction(Opcode, Operands, DL, Name);
}
+ VPInstruction *createOverflowingOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
+ DebugLoc DL, const Twine &Name = "") {
+ return tryInsertInstruction(
+ new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
+ }
VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4ba8dc85df79214..cc17d91d4f43727 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8065,14 +8065,6 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
return;
}
- // If we're using the active lane mask for control flow, then we get the
- // mask from the active lane mask PHI that is cached in the VPlan.
- TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
- if (useActiveLaneMaskForControlFlow(TFStyle)) {
- BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
- return;
- }
-
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
// constructing the desired canonical IV in the header block as its first
@@ -8086,14 +8078,8 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
VPValue *BlockMask = nullptr;
- if (useActiveLaneMask(TFStyle)) {
- VPValue *TC = Plan.getTripCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
- nullptr, "active.lane.mask");
- } else {
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
- }
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
BlockMaskCache[Header] = BlockMask;
}
@@ -8631,8 +8617,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// Add the necessary canonical IV and branch recipes required to control the
// loop.
-static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- TailFoldingStyle Style) {
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
+ DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
@@ -8644,93 +8630,19 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
- bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
new VPInstruction(VPInstruction::CanonicalIVIncrement, {CanonicalIVPHI},
{HasNUW, false}, DL, "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- if (useActiveLaneMaskForControlFlow(Style)) {
- // Create the active lane mask instruction in the vplan preheader.
- VPBasicBlock *VecPreheader =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
-
- // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
- // we have to take unrolling into account. Each part needs to start at
- // Part * VF
- auto *CanonicalIVIncrementParts =
- new VPInstruction(VPInstruction::CanonicalIVIncrementForPart, {StartV},
- {HasNUW, false}, DL, "index.part.next");
- VecPreheader->appendRecipe(CanonicalIVIncrementParts);
-
- // Create the ActiveLaneMask instruction using the correct start values.
- VPValue *TC = Plan.getTripCount();
-
- VPValue *TripCount, *IncrementValue;
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // When avoiding a runtime check, the active.lane.mask inside the loop
- // uses a modified trip count and the induction variable increment is
- // done after the active.lane.mask intrinsic is called.
- auto *TCMinusVF =
- new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
- VecPreheader->appendRecipe(TCMinusVF);
- IncrementValue = CanonicalIVPHI;
- TripCount = TCMinusVF;
- } else {
- // When the loop is guarded by a runtime overflow check for the loop
- // induction variable increment by VF, we can increment the value before
- // the get.active.lane mask and use the unmodified tripcount.
- EB->appendRecipe(CanonicalIVIncrement);
- IncrementValue = CanonicalIVIncrement;
- TripCount = TC;
- }
+ EB->appendRecipe(CanonicalIVIncrement);
- auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TC}, DL,
- "active.lane.mask.entry");
- VecPreheader->appendRecipe(EntryALM);
-
- // Now create the ActiveLaneMaskPhi recipe in the main loop using the
- // preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
- Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
-
- // Create the active lane mask for the next iteration of the loop.
- CanonicalIVIncrementParts =
- new VPInstruction(VPInstruction::CanonicalIVIncrementForPart,
- {IncrementValue}, {HasNUW, false}, DL);
- EB->appendRecipe(CanonicalIVIncrementParts);
-
- auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TripCount}, DL,
- "active.lane.mask.next");
- EB->appendRecipe(ALM);
- LaneMaskPhi->addOperand(ALM);
-
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // Do the increment of the canonical IV after the active.lane.mask, because
- // that value is still based off %CanonicalIVPHI
- EB->appendRecipe(CanonicalIVIncrement);
- }
-
- // We have to invert the mask here because a true condition means jumping
- // to the exit block.
- auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
- EB->appendRecipe(NotMask);
-
- VPInstruction *BranchBack =
- new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
- EB->appendRecipe(BranchBack);
- } else {
- EB->appendRecipe(CanonicalIVIncrement);
-
- // Add the BranchOnCount VPInstruction to the latch.
- VPInstruction *BranchBack = new VPInstruction(
- VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchBack);
- }
+ // Add the BranchOnCount VPInstruction to the latch.
+ VPInstruction *BranchBack =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchBack);
}
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
@@ -8817,8 +8729,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DL,
- CM.getTailFoldingStyle(IVUpdateMayOverflow));
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // When not folding the tail, we know that the induction increment will not
+ // overflow.
+ bool HasNUW = Style == TailFoldingStyle::None;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
// Proactively create header mask. Masks for other blocks are created on
// demand.
@@ -8987,6 +8902,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
return nullptr;
+ if (useActiveLaneMask(Style)) {
+ // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
+ // TailFoldingStyle is visible there.
+ bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
+ bool WithoutRuntimeCheck =
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
+ WithoutRuntimeCheck);
+ }
return Plan;
}
@@ -9021,8 +8945,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
Term->eraseFromParent();
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- CM.getTailFoldingStyle());
+ // Tail folding is not supported for outer loops, so the induction increment
+ // is guaranteed to not wrap.
+ bool HasNUW = true;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
+ DebugLoc());
return Plan;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index cf100cd000bb868..b4d464ad7ad8c7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -718,15 +718,6 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
return Plan;
}
-VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
- VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- if (isa<VPActiveLaneMaskPHIRecipe>(&R))
- return cast<VPActiveLaneMaskPHIRecipe>(&R);
- }
- return nullptr;
-}
-
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bc1033c011b8138..e65a7ab2cd028ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2682,10 +2682,6 @@ class VPlan {
return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
}
- /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
- /// be only one at most. If there isn't one, then return nullptr.
- VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();
-
void addLiveOut(PHINode *PN, VPValue *V);
void removeLiveOut(PHINode *PN) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 592063520a25b4b..692fecc2a87dd60 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -16,6 +16,7 @@
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -855,3 +856,152 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
removeRedundantExpandSCEVRecipes(Plan);
mergeBlocksIntoPredecessors(Plan);
}
+
+// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
+// the loop terminator with a branch-on-cond recipe with the negated
+// active-lane-mask as operand. Note that this turns the loop into an
+// uncountable one. Only the existing terminator is replaced, all other existing
+// recipes/users remain unchanged, except for poison-generating flags being
+// dropped from the canonical IV increment. Return the created
+// VPActiveLaneMaskPHIRecipe.
+//
+// The function uses the following definitions:
+//
+// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
+// calculate-trip-count-minus-VF (original TC) : original TC
+// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
+// CanonicalIVPhi : CanonicalIVIncrement
+// %StartV is the canonical induction start value.
+//
+// The function adds the following recipes:
+//
+// vector.ph:
+// %TripCount = calculate-trip-count-minus-VF (original TC)
+// [if DataWithControlFlowWithoutRuntimeCheck]
+// %EntryInc = canonical-iv-increment-for-part %StartV
+// %EntryALM = active-lane-mask %EntryInc, %TripCount
+//
+// vector.body:
+// ...
+// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
+// ...
+// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+// %ALM = active-lane-mask %InLoopInc, TripCount
+// %Negated = Not %ALM
+// branch-on-cond %Negated
+//
+static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
+ VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ // TODO: Check if dropping the flags is needed if
+ // !DataAndControlFlowWithoutRuntimeCheck.
+ CanonicalIVIncrement->dropPoisonGeneratingFlags();
+ DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+ // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+ // we have to take unrolling into account. Each part needs to start at
+ // Part * VF
+ auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
+ VPBuilder Builder(VecPreheader);
+
+ // Create the ActiveLaneMask instruction using the correct start values.
+ VPValue *TC = Plan.getTripCount();
+
+ VPValue *TripCount, *IncrementValue;
+ if (!DataAndControlFlowWithoutRuntimeCheck) {
+ // When the loop is guarded by a runtime overflow check for the loop
+ // induction variable increment by VF, we can increment the value before
+ // the get.active.lane mask and use the unmodified tripcount.
+ IncrementValue = CanonicalIVIncrement;
+ TripCount = TC;
+ } else {
+ // When avoiding a runtime check, the active.lane.mask inside the loop
+ // uses a modified trip count and the induction variable increment is
+ // done after the active.lane.mask intrinsic is called.
+ IncrementValue = CanonicalIVPHI;
+ TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+ {TC}, DL);
+ }
+ auto *EntryIncrement = Builder.createOverflowingOp(
+ VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
+ "index.part.next");
+
+ // Create the active lane mask instruction in the VPlan preheader.
+ auto *EntryALM =
+ Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
+ DL, "active.lane.mask.entry");
+
+ // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+ // preheader ActiveLaneMask instruction.
+ auto LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+ LaneMaskPhi->insertAfter(CanonicalIVPHI);
+
+ // Create the active lane mask for the next iteration of the loop before the
+ // original terminator.
+ VPRecipeBase *OriginalTerminator = EB->getTerminator();
+ Builder.setInsertPoint(OriginalTerminator);
+ auto *InLoopIncrement =
+ Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+ {IncrementValue}, {false, false}, DL);
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {InLoopIncrement, TripCount}, DL,
+ "active.lane.mask.next");
+ LaneMaskPhi->addOperand(ALM);
+
+ // Replace the original terminator with BranchOnCond. We have to invert the
+ // mask here because a true condition means jumping to the exit block.
+ auto *NotMask = Builder.createNot(ALM, DL);
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+ OriginalTerminator->eraseFromParent();
+ return LaneMaskPhi;
+}
+
+void VPlanTransforms::addActiveLaneMask(
+ VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
+ bool DataAndControlFlowWithoutRuntimeCheck) {
+ assert(!DataAndControlFlowWithoutRuntimeCheck ||
+ UseActiveLaneMaskForControlFlow &&
+ "DataAndControlFlowWithoutRuntimeCheck implies "
+ "UseActiveLaneMaskForControlFlow");
+
+ auto FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ assert(FoundWidenCanonicalIVUser &&
+ "Must have widened canonical IV when tail folding!");
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ VPRecipeBase *LaneMask;
+ if (UseActiveLaneMaskForControlFlow) {
+ LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
+ Plan, DataAndControlFlowWithoutRuntimeCheck);
+ } else {
+ LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
+ {WideCanonicalIV, Plan.getTripCount()},
+ nullptr, "active.lane.mask");
+ LaneMask->insertAfter(WideCanonicalIV);
+ }
+
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
+ // active-lane-mask.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
+ CompareToReplace->eraseFromParent();
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5bf778a799b8431..6d36ac41db9fb52 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -67,6 +67,18 @@ struct VPlanTransforms {
/// regions until no improvements are remaining.
static void createAndOptimizeReplicateRegions(VPlan &Plan);
+ /// Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an
+ /// (active-lane-mask recipe, wide canonical IV, trip-count). If \p
+ /// UseActiveLaneMaskForControlFlow is true, introduce an
+ /// VPActiveLaneMaskPHIRecipe. If \p DataAndControlFlowWithoutRuntimeCheck is
+ /// true, no minimum-iteration runtime check will be created (during skeleton
+ /// creation) and instead it is handled using active-lane-mask. \p
+ /// DataAndControlFlowWithoutRuntimeCheck implies \p
+ /// UseActiveLaneMaskForControlFlow.
+ static void addActiveLaneMask(VPlan &Plan,
+ bool UseActiveLaneMaskForControlFlow,
+ bool DataAndControlFlowWithoutRuntimeCheck);
+
private:
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 6ff60220c76f978..f045f486eac27f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -153,10 +153,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP13]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -397,34 +397,34 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
-; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
-; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
@@ -480,21 +480,21 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], <vscale x 8 x float> [[TMP67]])
; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = add i64 [[INDEX]], [[TMP70]]
-; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = add i64 [[INDEX]], [[TMP73]]
-; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = mul i64 [[TMP75]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = add i64 [[INDEX]], [[TMP76]]
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP71]], i64 [[TMP20]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP74]], i64 [[TMP25]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP77]], i64 [[TMP30]])
-; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = mul i64 [[TMP78]], 32
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP79]]
+; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]]
+; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]]
+; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]]
+; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP14]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP19]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP24]])
; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
@@ -755,10 +755,10 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP18]])
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[TMP20]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -1012,10 +1012,10 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP17]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -1258,10 +1258,10 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1709,34 +1709,34 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
-; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
-; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
@@ -1814,21 +1814,21 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]]
-; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]]
-; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]]
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]])
-; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]]
+; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]]
+; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]]
+; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]]
+; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]])
; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
@@ -2123,34 +2123,34 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
-; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
-; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 24
-; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[N]], [[TMP12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = sub i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[N]], [[TMP17]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = sub i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[N]], [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 32
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[N]], [[TMP27]]
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24
+; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
@@ -2228,21 +2228,21 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP78]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], <vscale x 8 x float> [[TMP85]])
; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = add i64 [[INDEX]], [[TMP88]]
-; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = mul i64 [[TMP90]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = add i64 [[INDEX]], [[TMP91]]
-; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = mul i64 [[TMP93]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = add i64 [[INDEX]], [[TMP94]]
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP89]], i64 [[TMP20]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP92]], i64 [[TMP25]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP95]], i64 [[TMP30]])
-; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = mul i64 [[TMP96]], 32
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP97]]
+; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]]
+; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]]
+; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]]
+; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]])
; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT17]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index d074507bee629e0..83523545f84d0ab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -146,10 +146,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -312,10 +312,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -487,10 +487,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 887fe0d4bf0357f..2ca5fbebed87721 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -17,8 +17,8 @@ target triple = "aarch64-unknown-linux-gnu"
; VPLANS-NEXT: No successors
; VPLANS-EMPTY:
; VPLANS-NEXT: vector.ph:
-; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0>
; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? TC - VF : 0 vp<[[TC]]>
+; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0>
; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]>
; VPLANS-NEXT: Successor(s): vector loop
; VPLANS-EMPTY:
@@ -29,9 +29,9 @@ target triple = "aarch64-unknown-linux-gnu"
; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1>
; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]>
; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<[[LANEMASK_PHI]]>
+; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]>
; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]>
; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]>
-; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]>
; VPLANS-NEXT: EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]>
; VPLANS-NEXT: EMIT branch-on-cond vp<[[NOT]]>
; VPLANS-NEXT: No successors
@@ -67,10 +67,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
index 45fca1669389dcf..c0e736399f2932d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
@@ -89,10 +89,10 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src,
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 42, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index bd8dbbb8ca443a9..7e0f18b9efae1b4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -37,10 +37,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -95,10 +95,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
-; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -169,10 +169,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -225,10 +225,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
-; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -306,10 +306,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -376,10 +376,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]]
-; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]]
+; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[TMP22]], i32 0
; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 231fc0d074903db..caeb37bce2f18a1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -19,34 +19,34 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
-; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12
-; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
-; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
-; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
-; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16
-; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]]
-; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
+; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]]
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12
+; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
@@ -55,11 +55,11 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
@@ -95,25 +95,25 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 4
-; CHECK-NEXT: [[TMP63:%.*]] = add i64 [[INDEX6]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 8
-; CHECK-NEXT: [[TMP66:%.*]] = add i64 [[INDEX6]], [[TMP65]]
-; CHECK-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 12
-; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX6]], [[TMP68]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP63]], i64 [[TMP20]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP66]], i64 [[TMP25]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP69]], i64 [[TMP30]])
-; CHECK-NEXT: [[TMP70:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP70]], 16
-; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP71]]
+; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16
+; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
+; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4
+; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]]
+; CHECK-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8
+; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 12
+; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP14]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP19]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP24]])
; CHECK-NEXT: [[TMP72:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP73:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT10]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP74:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT11]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP75:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP73:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT11]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP74:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT12]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP75:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP76:%.*]] = extractelement <vscale x 4 x i1> [[TMP72]], i32 0
; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -161,34 +161,34 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
-; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12
-; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
-; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[UMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[UMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
-; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[UMAX]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i64 [[UMAX]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
-; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[UMAX]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp ugt i64 [[UMAX]], [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 16
-; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[UMAX]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ugt i64 [[UMAX]], [[TMP27]]
-; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i64 [[TMP28]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
+; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]]
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12
+; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]])
@@ -197,11 +197,11 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
@@ -263,25 +263,25 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 4
-; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX6]], [[TMP84]]
-; CHECK-NEXT: [[TMP86:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP87:%.*]] = mul i64 [[TMP86]], 8
-; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX6]], [[TMP87]]
-; CHECK-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 12
-; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[INDEX6]], [[TMP90]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP15]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP85]], i64 [[TMP20]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP88]], i64 [[TMP25]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP91]], i64 [[TMP30]])
-; CHECK-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16
-; CHECK-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX6]], [[TMP93]]
+; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16
+; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
+; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4
+; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]]
+; CHECK-NEXT: [[TMP88:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP88]], 8
+; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]]
+; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12
+; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP14]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP19]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP24]])
; CHECK-NEXT: [[TMP94:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP95:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP96:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP95:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT14]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP96:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT15]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP97:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP98:%.*]] = extractelement <vscale x 4 x i1> [[TMP94]], i32 0
; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index f302ff24d08f221..292bdd5c8be85ae 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -34,10 +34,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -95,8 +95,8 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,10 +162,10 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -246,10 +246,10 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 4 x i1> [[TMP23]], i32 0
@@ -321,10 +321,10 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -399,10 +399,10 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -483,10 +483,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[TMP18]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[TMP22]], i32 0
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -574,10 +574,10 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -648,10 +648,10 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -729,10 +729,10 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP19]], i32 0
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index e99d0a15d1963e7..187f50f2e76a480 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -25,8 +25,8 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
+; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index b7915249c06e5e3..814d3ba53050e31 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -233,10 +233,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]