[llvm] [VPlan] Create epilogue minimum iteration check in VPlan. (PR #157545)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 21 03:12:56 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/157545
>From a43700384c4815ac799a8fcc0934270fc77a4940 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 29 Aug 2025 09:42:49 +0100
Subject: [PATCH 1/2] [VPlan] Create epilogue miniimum iteration check in
VPlan.
Move creation of the minimum iteration check for the epilogue vector
loop to VPlan. This is a first step towards breaking up and moving skeleton
creation for epilogue vectorization to VPlan.
It moves most logic out of EpilogueVectorizerEpilogueLoop: the minimum
iteration check is created directly in VPlan, connecting the check
blocks from the main vector loop is done as post-processing. Next steps
are to move connecting and updating the branches from the check blocks
to VPlan, as well as updating the incoming values for phis.
Test changes are improvements due to folding of live-ins.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 292 ++++++++++--------
.../AArch64/deterministic-type-shrinkage.ll | 3 +-
.../AArch64/epilog-iv-select-cmp.ll | 6 +-
.../AArch64/epilog-vectorization-factors.ll | 12 +-
.../epilog-vectorization-widen-inductions.ll | 9 +-
.../AArch64/f128-fmuladd-reduction.ll | 3 +-
.../AArch64/force-target-instruction-cost.ll | 3 +-
.../AArch64/interleave-with-gaps.ll | 8 +-
.../AArch64/interleaving-load-store.ll | 6 +-
.../AArch64/interleaving-reduction.ll | 3 +-
.../LoopVectorize/AArch64/intrinsiccost.ll | 6 +-
.../AArch64/loop-vectorization-factors.ll | 15 +-
.../AArch64/low_trip_count_predicates.ll | 6 +-
.../partial-reduce-dot-product-epilogue.ll | 3 +-
.../LoopVectorize/AArch64/store-costs-sve.ll | 3 +-
.../sve-epilog-vect-inloop-reductions.ll | 3 +-
.../AArch64/sve-epilog-vect-reductions.ll | 3 +-
.../sve-epilog-vect-strict-reductions.ll | 3 +-
.../LoopVectorize/AArch64/sve-epilog-vect.ll | 27 +-
.../AArch64/sve-epilog-vscale-fixed.ll | 8 +-
.../AArch64/sve2-histcnt-epilogue.ll | 4 +-
...-narrow-interleave-to-widen-memory-cost.ll | 6 +-
...ctor-loop-backedge-elimination-epilogue.ll | 3 +-
.../LoopVectorize/PowerPC/exit-branch-cost.ll | 3 +-
.../PowerPC/optimal-epilog-vectorization.ll | 12 +-
.../LoopVectorize/PowerPC/small-loop-rdx.ll | 3 +-
.../LoopVectorize/X86/conversion-cost.ll | 3 +-
.../X86/cost-conditional-branches.ll | 3 +-
.../LoopVectorize/X86/cost-model.ll | 6 +-
.../X86/epilog-vectorization-inductions.ll | 8 +-
.../LoopVectorize/X86/float-induction-x86.ll | 6 +-
.../LoopVectorize/X86/gather_scatter.ll | 3 +-
.../LoopVectorize/X86/induction-costs.ll | 3 +-
.../LoopVectorize/X86/intrinsiccost.ll | 6 +-
.../X86/invariant-load-gather.ll | 3 +-
.../X86/invariant-store-vectorization.ll | 9 +-
.../LoopVectorize/X86/masked_load_store.ll | 18 +-
.../Transforms/LoopVectorize/X86/pr23997.ll | 3 +-
.../Transforms/LoopVectorize/X86/pr54634.ll | 9 +-
.../LoopVectorize/X86/scatter_crash.ll | 6 +-
.../X86/vectorize-force-tail-with-evl.ll | 3 +-
41 files changed, 242 insertions(+), 300 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d78e190e8bf7b..79613a78f12f8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -688,11 +688,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
- /// The additional bypass block which conditionally skips over the epilogue
- /// loop after executing the main loop. Needed to resume inductions and
- /// reductions during epilogue vectorization.
- BasicBlock *AdditionalBypassBlock = nullptr;
-
public:
EpilogueVectorizerEpilogueLoop(
Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -703,20 +698,12 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
BFI, PSI, Checks, Plan, EPI.EpilogueVF,
EPI.EpilogueVF, EPI.EpilogueUF) {
- TripCount = EPI.TripCount;
+ TripCount = nullptr;
}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
BasicBlock *createVectorizedLoopSkeleton() final;
- /// Return the additional bypass block which targets the scalar loop by
- /// skipping the epilogue loop after completing the main loop.
- BasicBlock *getAdditionalBypassBlock() const {
- assert(AdditionalBypassBlock &&
- "Trying to access AdditionalBypassBlock but it has not been set");
- return AdditionalBypassBlock;
- }
-
protected:
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
@@ -7361,121 +7348,22 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
+ BasicBlock *Entry = OrigLoop->getLoopPreheader();
BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
- BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
- // Now, compare the remaining count and if there aren't enough iterations to
- // execute the vectorized epilogue skip to the scalar part.
- VectorPH->setName("vec.epilog.ph");
- BasicBlock *VecEpilogueIterationCountCheck =
- SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr,
- "vec.epilog.iter.check", true);
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
-
- emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH,
- VecEpilogueIterationCountCheck);
- AdditionalBypassBlock = VecEpilogueIterationCountCheck;
-
- // Adjust the control flow taking the state info from the main loop
- // vectorization into account.
- assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
- "expected this to be saved from the previous pass.");
- EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, VectorPH);
-
- EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, ScalarPH);
-
- // Adjust the terminators of runtime check blocks and phis using them.
- BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
- BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
- if (SCEVCheckBlock)
- SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, ScalarPH);
- if (MemCheckBlock)
- MemCheckBlock->getTerminator()->replaceUsesOfWith(
- VecEpilogueIterationCountCheck, ScalarPH);
-
- DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck);
-
- // The vec.epilog.iter.check block may contain Phi nodes from inductions or
- // reductions which merge control-flow from the latch block and the middle
- // block. Update the incoming values here and move the Phi into the preheader.
- SmallVector<PHINode *, 4> PhisInBlock(
- llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
-
- for (PHINode *Phi : PhisInBlock) {
- Phi->moveBefore(VectorPH->getFirstNonPHIIt());
- Phi->replaceIncomingBlockWith(
- VecEpilogueIterationCountCheck->getSinglePredecessor(),
- VecEpilogueIterationCountCheck);
-
- // If the phi doesn't have an incoming value from the
- // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
- // value and also those from other check blocks. This is needed for
- // reduction phis only.
- if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
- return EPI.EpilogueIterationCountCheck == IncB;
- }))
+ ScalarPH->getSinglePredecessor()->setName("vec.epilog.iter.check");
+ VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Entry);
+ VPBasicBlock *OldEntry = Plan.getEntry();
+ for (auto &R : make_early_inc_range(
+ make_range(OldEntry->getFirstNonPhi(), OldEntry->end()))) {
+ if (isa<VPIRInstruction>(&R))
continue;
- Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
- if (SCEVCheckBlock)
- Phi->removeIncomingValue(SCEVCheckBlock);
- if (MemCheckBlock)
- Phi->removeIncomingValue(MemCheckBlock);
+ R.moveBefore(*NewEntry, NewEntry->end());
}
- return VectorPH;
-}
-
-BasicBlock *
-EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
- BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) {
-
- assert(EPI.TripCount &&
- "Expected trip count to have been saved in the first pass.");
- Value *TC = EPI.TripCount;
- IRBuilder<> Builder(Insert->getTerminator());
- Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
-
- // Generate code to check if the loop's trip count is less than VF * UF of the
- // vector epilogue loop.
- auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
- ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
-
- Value *CheckMinIters =
- Builder.CreateICmp(P, Count,
- createStepForVF(Builder, Count->getType(),
- EPI.EpilogueVF, EPI.EpilogueUF),
- "min.epilog.iters.check");
-
- BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
- auto VScale = Cost->getVScaleForTuning();
- unsigned MainLoopStep =
- estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
- unsigned EpilogueLoopStep =
- estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
- // We assume the remaining `Count` is equally distributed in
- // [0, MainLoopStep)
- // So the probability for `Count < EpilogueLoopStep` should be
- // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
- // TODO: Improve the estimate by taking the estimated trip count into
- // consideration.
- unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
- const uint32_t Weights[] = {EstimatedSkipCount,
- MainLoopStep - EstimatedSkipCount};
- setBranchWeights(BI, Weights, /*IsExpected=*/false);
- ReplaceInstWithInst(Insert->getTerminator(), &BI);
-
- // A new entry block has been created for the epilogue VPlan. Hook it in, as
- // otherwise we would try to modify the entry to the main vector loop.
- VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
- VPBasicBlock *OldEntry = Plan.getEntry();
VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
Plan.setEntry(NewEntry);
- // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
- return Insert;
+ return ScalarPH->getSinglePredecessor();
}
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
@@ -9555,10 +9443,11 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
-static void
-preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
- const SCEV2ValueTy &ExpandedSCEVs,
- EpilogueLoopVectorizationInfo &EPI) {
+static void preparePlanForEpilogueVectorLoop(
+ VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
+ EpilogueLoopVectorizationInfo &EPI,
+ SmallVectorImpl<Instruction *> &InstsToMove, LoopVectorizationCostModel &CM,
+ ScalarEvolution &SE) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -9633,6 +9522,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
+ if (auto *I = dyn_cast<Instruction>(ResumeV))
+ InstsToMove.push_back(I);
} else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) {
Value *StartV = getStartValueFromReductionResult(RdxResult);
ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
@@ -9647,8 +9538,12 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
+ if (auto *I = dyn_cast<Instruction>(Cmp))
+ InstsToMove.push_back(I);
Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
+ if (auto *I = dyn_cast<Instruction>(ResumeV))
+ InstsToMove.push_back(I);
} else {
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9700,6 +9595,46 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
Plan.resetTripCount(ExpandedVal);
ExpandR->eraseFromParent();
}
+
+ // Add the minimum iteration check for the epilogue vector loop.
+ VPValue *TC = Plan.getOrAddLiveIn(EPI.TripCount);
+ VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
+ VPValue *Count = Builder.createNaryOp(
+ Instruction::Sub, {TC, Plan.getOrAddLiveIn(EPI.VectorTripCount)},
+ DebugLoc::getUnknown(), "n.vec.remaining");
+
+ // Generate code to check if the loop's trip count is less than VF * UF of
+ // the vector epilogue loop.
+ auto P = CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector())
+ ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
+ VPValue *VFxUF = Builder.createExpandSCEV(
+ SE.getElementCount(EPI.TripCount->getType(),
+ (EPI.EpilogueVF * EPI.EpilogueUF), SCEV::FlagNUW));
+
+ auto *CheckMinIters = Builder.createICmp(
+ P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check");
+ VPInstruction *Branch =
+ Builder.createNaryOp(VPInstruction::BranchOnCond, CheckMinIters);
+
+ auto VScale = CM.getVScaleForTuning();
+ unsigned MainLoopStep =
+ estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
+ unsigned EpilogueLoopStep =
+ estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
+ // We assume the remaining `Count` is equally distributed in
+ // [0, MainLoopStep)
+ // So the probability for `Count < EpilogueLoopStep` should be
+ // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
+ // TODO: Improve the estimate by taking the estimated trip count into
+ // consideration.
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ MDBuilder MDB(Plan.getContext());
+ MDNode *BranchWeights =
+ MDB.createBranchWeights(Weights, /*IsExpected=*/false);
+ Branch->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
// Generate bypass values from the additional bypass block. Note that when the
@@ -9766,6 +9701,98 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
}
}
+/// Connect the epilogue vector loop generated for \p Plan to the main vector
+/// loop, updating branches from the iteration and runtime checks, as well as
+/// updating various phis.
+static void connectEpilogueVectorLoop(
+ VPlan &Plan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT,
+ LoopVectorizationLegality &LVL,
+ DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
+ ArrayRef<Instruction *> InstsToMove) {
+ BasicBlock *AdditionalBypassBlock =
+ cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock();
+ BasicBlock *VecEpilogueIterationCountCheck =
+ cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock();
+
+ BasicBlock *LoopVectorPreHeader =
+ cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
+ ->getSuccessor(1);
+ // Adjust the control flow taking the state info from the main loop
+ // vectorization into account.
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
+ "expected this to be saved from the previous pass.");
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+
+ DTU.applyUpdates({{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
+ VecEpilogueIterationCountCheck},
+ {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
+ LoopVectorPreHeader}});
+
+ BasicBlock *ScalarPH =
+ cast<VPIRBasicBlock>(Plan.getScalarPreheader())->getIRBasicBlock();
+ EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, ScalarPH);
+ DTU.applyUpdates(
+ {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
+ VecEpilogueIterationCountCheck},
+ {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});
+
+ // Adjust the terminators of runtime check blocks and phis using them.
+ BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
+ BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
+ if (SCEVCheckBlock) {
+ SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, ScalarPH);
+ DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
+ VecEpilogueIterationCountCheck},
+ {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
+ }
+ if (MemCheckBlock) {
+ MemCheckBlock->getTerminator()->replaceUsesOfWith(
+ VecEpilogueIterationCountCheck, ScalarPH);
+ DTU.applyUpdates(
+ {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
+ {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
+ }
+
+ // The vec.epilog.iter.check block may contain Phi nodes from inductions
+ // or reductions which merge control-flow from the latch block and the
+ // middle block. Update the incoming values here and move the Phi into the
+ // preheader.
+ SmallVector<PHINode *, 4> PhisInBlock(
+ llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
+
+ for (PHINode *Phi : PhisInBlock) {
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
+ Phi->replaceIncomingBlockWith(
+ VecEpilogueIterationCountCheck->getSinglePredecessor(),
+ VecEpilogueIterationCountCheck);
+
+ // If the phi doesn't have an incoming value from the
+ // EpilogueIterationCountCheck, we are done. Otherwise remove the
+ // incoming value and also those from other check blocks. This is needed
+ // for reduction phis only.
+ if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
+ return EPI.EpilogueIterationCountCheck == IncB;
+ }))
+ continue;
+ Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
+ if (SCEVCheckBlock)
+ Phi->removeIncomingValue(SCEVCheckBlock);
+ if (MemCheckBlock)
+ Phi->removeIncomingValue(MemCheckBlock);
+ }
+
+ auto IP = LoopVectorPreHeader->getFirstNonPHIIt();
+ for (auto *I : InstsToMove)
+ I->moveBefore(IP);
+
+ fixScalarResumeValuesFromBypass(AdditionalBypassBlock, L, Plan, LVL,
+ ExpandedSCEVs, EPI.VectorTripCount);
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -10127,6 +10154,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// factor) again shortly afterwards.
VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
+ BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
BestEpiPlan);
@@ -10140,15 +10168,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
BFI, PSI, Checks, BestEpiPlan);
- EpilogILV.setTripCount(MainILV.getTripCount());
- preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
-
+ SmallVector<Instruction *> InstsToMove;
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI,
+ InstsToMove, CM, *PSE.getSE());
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
true);
-
- fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L,
- BestEpiPlan, LVL, ExpandedSCEVs,
- EPI.VectorTripCount);
+ connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
+ Checks, InstsToMove);
++LoopsEpilogueVectorized;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index 3b435f320b0c2..cf1ee26d754df 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -50,8 +50,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 3a46944712567..dc52e644742e2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -47,8 +47,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -169,8 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
index b83d3af3a0d65..a3b7392dd280f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
@@ -53,8 +53,7 @@ define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -163,8 +162,7 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -273,8 +271,7 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -466,8 +463,7 @@ define void @trip_count_based_on_ptrtoint(i64 %x) "target-cpu"="apple-m1" {
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[TMP12]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 87b8c4af1e0c7..307d4c43198af 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -135,8 +135,7 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -221,8 +220,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[START]], [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -305,8 +303,7 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[IND_END4]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[IND_END4]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
index 91ec9da11928c..35d7e2cc8c586 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
@@ -58,8 +58,7 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 7eb52280f4852..cdfe49d089771 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -84,8 +84,7 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 1c78c5e6f2ce8..893096d52e983 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -41,9 +41,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -174,9 +172,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index 9bd3d309c0ad9..9b4151f30d640 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -69,8 +69,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; INTERLEAVE-4: vec.epilog.iter.check:
-; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; INTERLEAVE-4: vec.epilog.ph:
; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -158,8 +157,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; INTERLEAVE-2: vec.epilog.iter.check:
-; INTERLEAVE-2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; INTERLEAVE-2: vec.epilog.ph:
; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index 7ff4609f8ec4b..95e249868ca66 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -51,8 +51,7 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; INTERLEAVE-4: vec.epilog.iter.check:
-; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; INTERLEAVE-4: vec.epilog.ph:
; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 58ebc7ce1f8f4..ee3a4a04566c9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -57,8 +57,7 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 2
; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -180,8 +179,7 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index 5066a9b8337bd..bd33af286b05d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -35,8 +35,7 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -132,8 +131,7 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -294,8 +292,7 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -472,8 +469,7 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -612,8 +608,7 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index bbb78d4373887..2ee5862daddb0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -88,8 +88,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
-; CHECK-VS1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK-VS1: [[VEC_EPILOG_PH]]:
; CHECK-VS1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -182,8 +181,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
-; CHECK-VS2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK-VS2: [[VEC_EPILOG_PH]]:
; CHECK-VS2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 9d4a969b571e7..e1eb33fd89b67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -87,8 +87,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index 4aeeb2329ed38..c382145b8a1cb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -36,8 +36,7 @@ define void @cost_store_i8(ptr %dst) #0 {
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; DEFAULT: vec.epilog.iter.check:
-; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]]
-; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; DEFAULT: vec.epilog.ph:
; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 793813e55409e..b78ada07db1b3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -42,8 +42,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index 44a8eba84a1d0..27779d5ceb0ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -41,8 +41,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index 4c4c9e57b4ffb..ebc1c1ef1e773 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -38,8 +38,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 6f605acd7ecbe..a829b4e758727 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -48,8 +48,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -97,8 +96,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
-; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -190,8 +188,7 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
-; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -270,8 +267,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -323,8 +319,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
-; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -394,8 +389,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 10000, [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -446,8 +440,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 10000, [[N_VEC]]
-; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -530,8 +523,7 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -668,8 +660,7 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll
index 761c0b62e98a3..45ffacf7e106a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll
@@ -54,8 +54,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -106,10 +105,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-EPILOG-PREFER-SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-EPILOG-PREFER-SCALABLE: vec.epilog.iter.check:
-; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 3
-; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP12]]
+; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]]
; CHECK-EPILOG-PREFER-SCALABLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-EPILOG-PREFER-SCALABLE: vec.epilog.ph:
; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
index c54511e957ef8..209fa60b260aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
@@ -37,9 +37,7 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 1
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP23]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 2c0562bf145ef..b5dfe0e88d057 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -214,8 +214,7 @@ define void @test_interleave_store_one_constant(ptr noalias %src, ptr noalias %d
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -372,8 +371,7 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
index 21928ce715007..44b4e5a8c2bc7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
@@ -29,8 +29,7 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TC]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 49cb29ed4e16a..d8f5c699d575f 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -143,8 +143,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index 597339b906e0b..c44023eb20046 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -84,8 +84,7 @@ define void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 %N) {
; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-TWO-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; VF-TWO-CHECK: [[VEC_EPILOG_PH]]:
; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -187,8 +186,7 @@ define void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 %N) {
; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-FOUR-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; VF-FOUR-CHECK: [[VEC_EPILOG_PH]]:
; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -328,8 +326,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-TWO-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; VF-TWO-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
-; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; VF-TWO-CHECK: [[VEC_EPILOG_PH]]:
; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -453,8 +450,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-FOUR-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; VF-FOUR-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
-; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; VF-FOUR-CHECK: [[VEC_EPILOG_PH]]:
; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
index ca39b35aeae1c..d82a3cde4639a 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
@@ -72,8 +72,7 @@ define void @test(ptr %arr, i32 %len) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 9506ad30c788b..6d2cda48f90ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -36,8 +36,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
index 6c1b2568d872a..a2a46affd5cf1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -582,8 +582,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
; CHECK: middle.block:
; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 2a2f82bfe4b3a..89b3f065e64e6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -222,8 +222,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -903,8 +902,7 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 90d261b78c27c..f33c8d9e9ae9b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -54,8 +54,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -174,15 +173,14 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[L]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc i64 [[N_VEC]] to i16
; CHECK-NEXT: [[IND_END10:%.*]] = mul i16 [[DOTCAST9]], [[TMP0]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[L]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index c0ff8816c2543..3b0ad73d91338 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -59,8 +59,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: [[DOTCAST12:%.*]] = sitofp i64 [[N_VEC]] to float
; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast float 5.000000e-01, [[DOTCAST12]]
; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd fast float 1.000000e+00, [[TMP11]]
-; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; AUTO_VEC: [[VEC_EPILOG_PH]]:
; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -353,8 +352,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: [[DOTCAST16:%.*]] = sitofp i64 [[N_VEC]] to float
; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul reassoc float 4.200000e+01, [[DOTCAST16]]
; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd reassoc float 1.000000e+00, [[TMP12]]
-; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; AUTO_VEC: [[VEC_EPILOG_PH]]:
; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 262a5cf7991ae..5204923972547 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -689,8 +689,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
; AVX512-NEXT: [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
-; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
-; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF23:![0-9]+]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 41a4e9c681fad..ae94df3be4923 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -44,8 +44,7 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index b480eaf7502a8..d75fd0e0023f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -63,8 +63,7 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -190,8 +189,7 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
index 42d3019cc0ba2..9a3616a4340ff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -46,8 +46,7 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 199f1c15fbc3d..b8862afb4e948 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -61,8 +61,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b)
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -169,8 +168,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -317,8 +315,7 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX10]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 18b97f4cded49..db26d33f9c02d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -1378,8 +1378,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX1: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF19:![0-9]+]]
; AVX1: [[SCALAR_PH]]:
; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -1471,8 +1470,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX2: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF33:![0-9]+]]
; AVX2: [[SCALAR_PH]]:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -1564,8 +1562,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX512: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF21]]
; AVX512: [[SCALAR_PH]]:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -1702,8 +1699,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX1: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF19]]
; AVX1: [[SCALAR_PH]]:
; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -1795,8 +1791,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX2: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF33]]
; AVX2: [[SCALAR_PH]]:
; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -1888,8 +1883,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AVX512: [[VEC_EPILOG_ITER_CHECK]]:
-; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF21]]
; AVX512: [[SCALAR_PH]]:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index c1adffde07510..31269b1b8c221 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -57,8 +57,7 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[UMAX2]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 994cd331c4194..0cd9b5387c09b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -63,9 +63,8 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP8]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]], !prof [[PROF15:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP]] ]
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP8]], 4
@@ -88,7 +87,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND8]], splat (i64 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[L26]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[L26]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[L44]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -103,7 +102,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]]
; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]]
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L27]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L27]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: L44:
; CHECK-NEXT: ret ptr addrspace(10) null
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index df54411f7e710..70c774f6e7d38 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -70,8 +70,7 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
; CHECK-NEXT: [[IND_END9:%.*]] = add i64 8, [[TMP64]]
; CHECK-NEXT: [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -160,8 +159,7 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
; CHECK-NEXT: [[IND_END55:%.*]] = add i64 8, [[TMP42]]
; CHECK-NEXT: [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
-; CHECK-NEXT: [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_MOD_VF31]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label %[[VEC_EPILOG_SCALAR_PH40]], label %[[VEC_EPILOG_PH42]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH42]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
index 6dea2f6b146da..9fe9a14ed13b4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -104,8 +104,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; NO-VP: vec.epilog.iter.check:
-; NO-VP-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; NO-VP: vec.epilog.ph:
; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
>From 889bcb30de81f87ac4615e37521ae900f3a82b01 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Sep 2025 10:56:11 +0100
Subject: [PATCH 2/2] !fixup address comments, thanks
---
.../Transforms/Vectorize/LoopVectorize.cpp | 77 +++++++------------
.../Vectorize/VPlanConstruction.cpp | 37 +++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 7 ++
3 files changed, 70 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47c1393e93146..1118ea946fca5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -702,7 +702,6 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
BFI, PSI, Checks, Plan, EPI.EpilogueVF,
EPI.EpilogueVF, EPI.EpilogueUF) {
- TripCount = nullptr;
}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
@@ -712,7 +711,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
/// Emits an iteration count bypass check after the main vector loop has
/// finished to see if there are any iterations left to execute by either
/// the vector epilogue or the scalar epilogue.
- BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH,
+ BasicBlock *emitMinimumVectorEpilogueIterationCheck(BasicBlock *VectorPH,
BasicBlock *Bypass,
BasicBlock *Insert);
void printDebugTracesAtStart() override;
@@ -7392,13 +7391,14 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//
-/// This function is partially responsible for generating the control flow
-/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
+/// This function creates a new scalar preheader, using the previous one as
+/// entry block to the epilogue VPlan. The minimum iteration check is already
+/// created in VPlan.
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
- BasicBlock *Entry = OrigLoop->getLoopPreheader();
+ BasicBlock *OriginalScalarPH = OrigLoop->getLoopPreheader();
BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
- ScalarPH->getSinglePredecessor()->setName("vec.epilog.iter.check");
- VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Entry);
+ OriginalScalarPH->setName("vec.epilog.iter.check");
+ VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
VPBasicBlock *OldEntry = Plan.getEntry();
for (auto &R : make_early_inc_range(
make_range(OldEntry->getFirstNonPhi(), OldEntry->end()))) {
@@ -7409,6 +7409,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
Plan.setEntry(NewEntry);
+ // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
return ScalarPH->getSinglePredecessor();
}
@@ -9489,17 +9490,20 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
}
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
-/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
-static void preparePlanForEpilogueVectorLoop(
+/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
+/// reductions require creating new instructions to compute the resume values.
+/// They are collected in a vector and returned. They must be moved to the
+/// vector preheader.
+static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
- EpilogueLoopVectorizationInfo &EPI,
- SmallVectorImpl<Instruction *> &InstsToMove, LoopVectorizationCostModel &CM,
+ EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
ScalarEvolution &SE) {
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
DenseMap<Value *, Value *> ToFrozen;
+ SmallVector<Instruction *> InstsToMove;
// Ensure that the start values for all header phi recipes are updated before
// vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) {
@@ -9643,45 +9647,17 @@ static void preparePlanForEpilogueVectorLoop(
ExpandR->eraseFromParent();
}
- // Add the minimum iteration check for the epilogue vector loop.
- VPValue *TC = Plan.getOrAddLiveIn(EPI.TripCount);
- VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
- VPValue *Count = Builder.createNaryOp(
- Instruction::Sub, {TC, Plan.getOrAddLiveIn(EPI.VectorTripCount)},
- DebugLoc::getUnknown(), "n.vec.remaining");
-
- // Generate code to check if the loop's trip count is less than VF * UF of
- // the vector epilogue loop.
- auto P = CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector())
- ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
- VPValue *VFxUF = Builder.createExpandSCEV(
- SE.getElementCount(EPI.TripCount->getType(),
- (EPI.EpilogueVF * EPI.EpilogueUF), SCEV::FlagNUW));
-
- auto *CheckMinIters = Builder.createICmp(
- P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check");
- VPInstruction *Branch =
- Builder.createNaryOp(VPInstruction::BranchOnCond, CheckMinIters);
-
auto VScale = CM.getVScaleForTuning();
unsigned MainLoopStep =
estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
unsigned EpilogueLoopStep =
estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
- // We assume the remaining `Count` is equally distributed in
- // [0, MainLoopStep)
- // So the probability for `Count < EpilogueLoopStep` should be
- // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
- // TODO: Improve the estimate by taking the estimated trip count into
- // consideration.
- unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
- const uint32_t Weights[] = {EstimatedSkipCount,
- MainLoopStep - EstimatedSkipCount};
- MDBuilder MDB(Plan.getContext());
- MDNode *BranchWeights =
- MDB.createBranchWeights(Weights, /*IsExpected=*/false);
- Branch->addMetadata(LLVMContext::MD_prof, BranchWeights);
+ VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
+ Plan, EPI.TripCount, EPI.VectorTripCount,
+ CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector()), EPI.EpilogueVF,
+ EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
+
+ return InstsToMove;
}
// Generate bypass values from the additional bypass block. Note that when the
@@ -9750,7 +9726,8 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
/// Connect the epilogue vector loop generated for \p Plan to the main vector
/// loop, updating branches from the iteration and runtime checks, as well as
-/// updating various phis.
+/// updating various phis. \p InstsToMove contains instructions that need to be
+/// moved to the vector preheader.
static void connectEpilogueVectorLoop(
VPlan &Plan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT,
LoopVectorizationLegality &LVL,
@@ -9758,8 +9735,7 @@ static void connectEpilogueVectorLoop(
ArrayRef<Instruction *> InstsToMove) {
BasicBlock *AdditionalBypassBlock =
cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock();
- BasicBlock *VecEpilogueIterationCountCheck =
- cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock();
+ BasicBlock *VecEpilogueIterationCountCheck = AdditionalBypassBlock;
BasicBlock *LoopVectorPreHeader =
cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
@@ -10215,9 +10191,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
BFI, PSI, Checks, BestEpiPlan);
- SmallVector<Instruction *> InstsToMove;
- preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI,
- InstsToMove, CM, *PSE.getSE());
+ SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
+ BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
true);
connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index cef91c15dd873..c8212af9f8e00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -756,6 +756,43 @@ void VPlanTransforms::addMinimumIterationCheck(
}
}
+void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
+ VPlan &Plan, Value *TripCount, Value *VectorTripCount,
+ bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
+ unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE) {
+ // Add the minimum iteration check for the epilogue vector loop.
+ VPValue *TC = Plan.getOrAddLiveIn(TripCount);
+ VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
+ VPValue *Count = Builder.createNaryOp(
+ Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)},
+ DebugLoc::getUnknown(), "n.vec.remaining");
+
+ // Generate code to check if the loop's trip count is less than VF * UF of
+ // the vector epilogue loop.
+ auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
+ TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
+
+ auto *CheckMinIters = Builder.createICmp(
+ P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check");
+ VPInstruction *Branch =
+ Builder.createNaryOp(VPInstruction::BranchOnCond, CheckMinIters);
+
+ // We assume the remaining `Count` is equally distributed in
+ // [0, MainLoopStep)
+ // So the probability for `Count < EpilogueLoopStep` should be
+ // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
+ // TODO: Improve the estimate by taking the estimated trip count into
+ // consideration.
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ MDBuilder MDB(Plan.getContext());
+ MDNode *BranchWeights =
+ MDB.createBranchWeights(Weights, /*IsExpected=*/false);
+ Branch->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
+
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 69452a7e37572..4c65cb7d7a80d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -117,6 +117,13 @@ struct VPlanTransforms {
bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);
+ /// Add a check to \p Plan to see if the epilogue vector loop should be
+ /// executed.
+ static void addMinimumVectorEpilogueIterationCheck(
+ VPlan &Plan, Value *TripCount, Value *VectorTripCount,
+ bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
+ unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);
+
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
/// flat CFG into a hierarchical CFG.
LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);
More information about the llvm-commits
mailing list