[llvm-branch-commits] [llvm] [VPlan] Start implementing VPlan-based stride multiversioning (PR #182595)
Andrei Elovikov via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Feb 20 13:22:31 PST 2026
https://github.com/eas created https://github.com/llvm/llvm-project/pull/182595
This commit only implements the run-time guard without actually optimizing the vector loop. That would come in a separate PR to ease review.
Stacked on top of https://github.com/llvm/llvm-project/pull/182594.
>From 571ed37feaeae153e33f76022346231e900e3a1b Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Fri, 20 Feb 2026 12:25:15 -0800
Subject: [PATCH] [VPlan] Start implementing VPlan-based stride multiversioning
This commit only implements the run-time guard without actually
optimizing the vector loop. That would come in a separate PR to ease
review.
---
.../Vectorize/LoopVectorizationPlanner.h | 5 +
.../Transforms/Vectorize/LoopVectorize.cpp | 111 +++++++
llvm/lib/Transforms/Vectorize/VPlan.h | 43 +++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +-
.../Transforms/Vectorize/VPlanTransforms.h | 5 +
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 5 +-
.../VPlan/vplan-based-stride-mv.ll | 202 +++++++++----
.../LoopVectorize/vplan-based-stride-mv.ll | 286 ++++++++++++++----
10 files changed, 555 insertions(+), 129 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..189dcf121e65b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -382,6 +382,11 @@ class VPBuilder {
return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
}
+ VPExpandStridePredicatesRecipe *
+ createExpandSCEVPredicate(const SCEVUnionPredicate &StridePredicates) {
+ return tryInsertInstruction(new VPExpandStridePredicatesRecipe(StridePredicates));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ea9fa7ac3288..86d98ce58e87c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -418,6 +418,11 @@ static cl::opt<bool> ConsiderRegPressure(
"vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
cl::desc("Discard VFs if their register pressure is too high."));
+static cl::opt<bool> EnableVPlanBasedStrideMV(
+ "enable-vplan-based-stride-mv", cl::init(false), cl::Hidden,
+ cl::desc("Perform stride multiversioning directly on VPlan instead of in "
+ "LoopAccessAnalysis."));
+
// Likelyhood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -10121,6 +10126,10 @@ void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
return false;
});
+ if (EnableVPlanBasedStrideMV)
+ RUN_VPLAN_PASS(VPlanTransforms::multiversionForUnitStridedMemOps, Plan,
+ CostCtx, MemOps);
+
VPlanTransforms::runPass("delegateMemOpWideningToLegacyCM", ProcessSubset,
Plan, [&](VPInstruction *VPI) {
VPRecipeBase *Recipe =
@@ -10132,3 +10141,105 @@ void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
return ReplaceWith(VPI, Recipe);
});
}
+
+void VPlanTransforms::multiversionForUnitStridedMemOps(
+ VPlan &Plan, VPCostContext &CostCtx,
+ SmallVectorImpl<VPInstruction *> &MemOps) {
+ SmallVector<VPInstruction *> RemainingOps;
+ // Makes a copy of VPTypeAnalysis (not sure where the problem is).
+ auto Types = CostCtx.Types;
+
+ ScalarEvolution *SE = CostCtx.PSE.getSE();
+
+ PredicatedScalarEvolution StrideMVPSE(*SE, const_cast<Loop &>(*CostCtx.L));
+
+ SCEVUnionPredicate StridePredicates({}, *SE);
+
+ // Use `for_each` so that we can do `return Skip();`.
+ for_each(MemOps, [&](VPInstruction *VPI) {
+ auto Skip = [&]() { RemainingOps.push_back(VPI); };
+ auto *PtrOp = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
+ : VPI->getOperand(1);
+
+ const SCEV *PtrSCEV =
+ vputils::getSCEVExprForVPValue(PtrOp, CostCtx.PSE, CostCtx.L);
+ const SCEV *Start = nullptr;
+ const SCEV *Stride = nullptr;
+
+ if (!match(PtrSCEV, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Stride),
+ m_SpecificLoop(CostCtx.L)))) {
+ return Skip();
+ }
+
+ Type *ScalarTy = Types.inferScalarType(
+ VPI->getOpcode() == Instruction::Load ? VPI : VPI->getOperand(0));
+
+ const SCEV *TypeSize = SE->getSizeOfExpr(
+ Stride->getType(), SE->getDataLayout().getTypeStoreSize(ScalarTy));
+
+ if (isa<SCEVConstant>(Stride)) {
+ // TODO: Process non-MV unit strided accesses prior to this pass so that
+ // we could be sure this one is due to another MemOp MV.
+ return Skip();
+ }
+
+ const SCEVConstant *StrideConstantMultiplier;
+ const SCEV *StrideNonConstantMultiplier;
+
+ const SCEV *ToMultiVersion = Stride;
+ const SCEV *MVConst = TypeSize;
+ if (match(Stride, m_scev_c_Mul(m_SCEVConstant(StrideConstantMultiplier),
+ m_SCEV(StrideNonConstantMultiplier)))) {
+ if (TypeSize != StrideConstantMultiplier) {
+ // TODO: Support `TypeSize = N * StrideConstantMultiplier`,
+ // including negative `N`. For now, only process when they're equal,
+ // which matches the useful part of the legacy behavior that
+ // multiversions GEP index for stride one.
+ return Skip();
+ }
+ ToMultiVersion = StrideNonConstantMultiplier;
+ MVConst = SE->getOne(ToMultiVersion->getType());
+ } else if (!TypeSize->isOne()) {
+ // Likewise - try to match legacy behavior.
+ return Skip();
+ }
+
+ if (!isa<SCEVUnknown>(ToMultiVersion)) {
+ // Match legacy behavior.
+ return Skip();
+ }
+
+ StridePredicates = StridePredicates.getUnionWith(
+ SE->getComparePredicate(CmpInst::ICMP_EQ, ToMultiVersion, MVConst),
+ *SE);
+
+ return Skip();
+ });
+
+ MemOps.swap(RemainingOps);
+
+ if (StridePredicates.isAlwaysTrue())
+ return;
+
+ VPBasicBlock *Entry = Plan.getEntry();
+ VPBuilder Builder(Entry);
+
+ auto *Pred =
+ Builder.createExpandSCEVPredicate(StridePredicates);
+
+ auto *StridesCheckBB = Plan.createVPBasicBlock("strides.check");
+ VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+ VPBlockUtils::insertBlockBefore(StridesCheckBB, Plan.getVectorPreheader());
+ VPBlockUtils::connectBlocks(StridesCheckBB, ScalarPH);
+ // SCEVExpander::expandCodeForPredicate would negate the condition, so scalar
+ // preheader should be the first successor.
+ std::swap(StridesCheckBB->getSuccessors()[0],
+ StridesCheckBB->getSuccessors()[1]);
+ Builder.setInsertPoint(StridesCheckBB);
+ Builder.createNaryOp(VPInstruction::BranchOnCond, Pred);
+
+ for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
+ auto &Phi = cast<VPPhi>(R);
+ Phi.addOperand(Phi.getIncomingValueForBlock(Entry));
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..2a99be00daaac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -406,6 +406,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
VPBranchOnMaskSC,
VPDerivedIVSC,
VPExpandSCEVSC,
+ VPExpandStridePredicatesSC,
VPExpressionSC,
VPIRInstructionSC,
VPInstructionSC,
@@ -599,6 +600,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPRecipeValue {
switch (R->getVPRecipeID()) {
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPExpandSCEVSC:
+ case VPRecipeBase::VPExpandStridePredicatesSC:
case VPRecipeBase::VPExpressionSC:
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
@@ -3742,6 +3744,47 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe {
#endif
};
+class VPExpandStridePredicatesRecipe : public VPSingleDefRecipe {
+ SCEVUnionPredicate StridePredicates;
+
+public:
+ VPExpandStridePredicatesRecipe(const SCEVUnionPredicate &StridePredicates)
+ : VPSingleDefRecipe(VPRecipeBase::VPExpandStridePredicatesSC, {}),
+ StridePredicates(StridePredicates) {}
+
+ VPExpandStridePredicatesRecipe(SCEVUnionPredicate &&StridePredicates)
+ : VPSingleDefRecipe(VPRecipeBase::VPExpandStridePredicatesSC, {}),
+ StridePredicates(std::move(StridePredicates)) {}
+
+ ~VPExpandStridePredicatesRecipe() override = default;
+
+ VPExpandStridePredicatesRecipe *clone() override {
+ return new VPExpandStridePredicatesRecipe(StridePredicates);
+ }
+
+ VP_CLASSOF_IMPL(VPRecipeBase::VPExpandStridePredicatesSC)
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("SCEVPredicates must be expanded before final execute");
+ }
+
+ /// Return the cost of this VPExpandStridePredicatesRecipe.
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ const SCEVPredicate *getSCEVPredicate() const { return &StridePredicates; }
+
+protected:
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// Canonical scalar induction phi of the vector loop. Starting at the specified
/// start value (either 0 or the resume value when vectorizing the epilogue
/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..524ef4211e862 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -316,6 +316,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
+ .Case([this](const VPExpandStridePredicatesRecipe *R) {
+ return Type::getInt1Ty(Ctx);
+ })
.Case([this](const VPReductionRecipe *R) {
return inferScalarType(R->getChainOp());
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..695bc7758c054 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -4508,6 +4508,13 @@ void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = EXPAND SCEV " << *Expr;
}
+
+void VPExpandStridePredicatesRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = EXPAND SCEVPredicate " << StridePredicates;
+}
#endif
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..b90a9eb342241 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5122,6 +5122,16 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
if (isa<VPIRInstruction, VPIRPhi>(&R))
continue;
+ if (auto *ExpStrides = dyn_cast<VPExpandStridePredicatesRecipe>(&R)) {
+ Value *Res = Expander.expandCodeForPredicate(
+ ExpStrides->getSCEVPredicate(), EntryBB->getTerminator());
+ Res->setName("strides.mv.check");
+ VPValue *Exp = Plan.getOrAddLiveIn(Res);
+
+ ExpStrides->replaceAllUsesWith(Exp);
+ ExpStrides->eraseFromParent();
+ continue;
+ }
auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
if (!ExpSCEV)
break;
@@ -5135,9 +5145,10 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
Plan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
- assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
- "VPExpandSCEVRecipes must be at the beginning of the entry block, "
- "before any VPIRInstructions");
+ assert(none_of(*Entry,
+ IsaPred<VPExpandSCEVRecipe, VPExpandStridePredicatesRecipe>) &&
+ "VPExpandSCEVRecipes/VPExpandStridePredicatesRecipe must be at the "
+ "beginning of the entry block, before any VPIRInstructions");
// Add IR instructions in the entry basic block but not in the VPIRBasicBlock
// to the VPIRBasicBlock.
auto EI = Entry->begin();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 787a687f19cdd..ddc31659b0514 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -487,6 +487,11 @@ struct VPlanTransforms {
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
VPRecipeBuilder &RecipeBuilder,
VPCostContext &CostCtx);
+
+ /// Multiversion unit-strided memory accesses. On return, \p MemOps contains
+ /// only the operations that were not handled by this pass.
+ static void multiversionForUnitStridedMemOps(VPlan &Plan, VPCostContext &CostCtx,
+ SmallVectorImpl<VPInstruction *> &MemOps);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index f5318bb1c6515..af08539129362 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -389,8 +389,9 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
return Expr->isSingleScalar();
- // VPExpandSCEVRecipes must be placed in the entry and are always uniform.
- return isa<VPExpandSCEVRecipe>(VPV);
+ // VPExpandSCEVRecipes and VPExpandStridePredicatesRecipe must be placed in
+ // the entry and are always uniform.
+ return isa<VPExpandSCEVRecipe, VPExpandStridePredicatesRecipe>(VPV);
}
bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-based-stride-mv.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-based-stride-mv.ll
index 33eac6bc08b26..610d8cc63217f 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-based-stride-mv.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-based-stride-mv.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -disable-output \
-; RUN: -vplan-print-after=scalarizeMemOpsWithIrregularTypes \
-; RUN: -enable-mem-access-versioning=false 2>&1 | FileCheck %s
+; RUN: -vplan-print-after=multiversionForUnitStridedMemOps \
+; RUN: -enable-mem-access-versioning=false -enable-vplan-based-stride-mv 2>&1 | FileCheck %s
define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-LABEL: VPlan for loop in 'basic'
@@ -12,6 +12,12 @@ define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -19,7 +25,7 @@ define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -28,8 +34,8 @@ define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -44,11 +50,11 @@ define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld = getelementptr i64, ptr %p, i64 %idx
@@ -90,6 +96,12 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -97,7 +109,7 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%stride.x8> = mul ir<%stride>, ir<8>
@@ -107,8 +119,8 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -123,11 +135,11 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %stride.x8 = mul i64 %stride, 8
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride.x8
@@ -573,6 +585,12 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -580,7 +598,7 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -592,8 +610,8 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%val>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -608,11 +626,11 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld0 = getelementptr i64, ptr %p0, i64 %idx
@@ -660,6 +678,12 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -667,7 +691,7 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%stride1> = add ir<%stride>, ir<1>
@@ -681,8 +705,8 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%val>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -697,11 +721,11 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %stride1 = add i64 %stride, 1
; CHECK-NEXT: IR %idx0 = mul i64 %iv, %stride
@@ -753,6 +777,12 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -760,7 +790,7 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%stride0> = add ir<%stride>, ir<1>
@@ -774,8 +804,8 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%val>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -790,11 +820,11 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %stride0 = add i64 %stride, 1
; CHECK-NEXT: IR %idx0 = mul i64 %iv, %stride0
@@ -1033,6 +1063,12 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1040,7 +1076,7 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -1052,8 +1088,8 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%val>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1068,11 +1104,11 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld0 = getelementptr i64, ptr %p, i64 %idx
@@ -1120,6 +1156,12 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1127,7 +1169,7 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%iv.sdiv32> = sdiv ir<%iv>, ir<32>
@@ -1138,8 +1180,8 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1154,11 +1196,11 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %iv.sdiv32 = sdiv i64 %iv, 32
; CHECK-NEXT: IR %mul = mul i64 %iv, %stride
@@ -1382,6 +1424,12 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 smax %n)
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP4]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1389,7 +1437,7 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -1398,8 +1446,8 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<%n>
-; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1414,11 +1462,11 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP10:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP8]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP10]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld = getelementptr i64, ptr %p, i64 %idx
@@ -1461,6 +1509,12 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 smax %stride)
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP4]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1468,7 +1522,7 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -1477,8 +1531,8 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<%stride>
-; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1493,11 +1547,11 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP10:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP8]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP10]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld = getelementptr i64, ptr %p, i64 %idx
@@ -1541,6 +1595,12 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: IR %n = add i64 %stride, 1
; CHECK-NEXT: EMIT vp<[[VP3]]> = EXPAND SCEV (1 smax (1 + %stride))
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP4]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1548,7 +1608,7 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -1557,8 +1617,8 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<%n>
-; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP6]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1573,11 +1633,11 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP10:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP8]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP10]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld = getelementptr i64, ptr %p, i64 %idx
@@ -1702,6 +1762,12 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -1709,7 +1775,7 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%idx> = mul ir<%iv>, ir<%stride>
@@ -1718,8 +1784,8 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: EMIT ir<%gep.st> = getelementptr ir<%p.out>, ir<%iv>
; CHECK-NEXT: EMIT store ir<%ld>, ir<%gep.st>
; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -1734,11 +1800,11 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP7]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ] (extra operand: vp<[[VP9]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %idx = mul i64 %iv, %stride
; CHECK-NEXT: IR %gep.ld = getelementptr [256 x [256 x i64]], ptr %p, i64 1, i64 42, i64 %idx
@@ -2201,6 +2267,12 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = EXPAND SCEVPredicate Equal predicate: %stride == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Successor(s): scalar.ph, strides.check
+; CHECK-EMPTY:
+; CHECK-NEXT: strides.check:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[VP3]]>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
@@ -2208,7 +2280,7 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VP0]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nsw ir<%iv>, ir<1>
; CHECK-NEXT: EMIT ir<%c> = icmp sge ir<%iv>, ir<%x>
@@ -2223,11 +2295,11 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-NEXT: Successor(s): latch
; CHECK-EMPTY:
; CHECK-NEXT: latch:
-; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%c>
-; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = or ir<%c>, vp<[[VP4]]>
-; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>, vp<[[VP5]]>
-; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%exitcond>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%c>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or ir<%c>, vp<[[VP6]]>
+; CHECK-NEXT: EMIT ir<%exitcond> = icmp sge ir<%iv.next>, ir<128>, vp<[[VP7]]>
+; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = not ir<%exitcond>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP5]]>, vp<[[VP1]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
@@ -2242,11 +2314,11 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[VP11:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, strides.check ]
; CHECK-NEXT: Successor(s): ir-bb<header>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<header>:
-; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] (extra operand: vp<[[VP9]]> from scalar.ph)
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] (extra operand: vp<[[VP11]]> from scalar.ph)
; CHECK-NEXT: IR %iv.next = add nsw i64 %iv, 1
; CHECK-NEXT: IR %c = icmp sge i64 %iv, %x
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-based-stride-mv.ll b/llvm/test/Transforms/LoopVectorize/vplan-based-stride-mv.ll
index b9688d1ecfd1c..46ae750441f1e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-based-stride-mv.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-based-stride-mv.ll
@@ -1,19 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -S \
-; RUN: -enable-mem-access-versioning=false 2>&1 | FileCheck %s
+; RUN: -enable-mem-access-versioning=false -enable-vplan-based-stride-mv 2>&1 | FileCheck %s
define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-LABEL: define void @basic(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -39,6 +42,18 @@ define void @basic(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD:%.*]] = getelementptr i64, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -68,15 +83,18 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-LABEL: define void @byte_gep_scaled_stride(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT]], splat (i64 3)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i64> [[VEC_IND]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
@@ -99,9 +117,22 @@ define void @byte_gep_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[STRIDE_X8:%.*]] = mul i64 [[STRIDE]], 8
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE_X8]]
+; CHECK-NEXT: [[GEP_LD:%.*]] = getelementptr i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -163,7 +194,7 @@ define void @byte_gep_under_scaled_stride(ptr noalias %p.out, ptr %p, i64 %strid
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -227,7 +258,7 @@ define void @byte_gep_over_scaled_stride(ptr noalias %p.out, ptr %p, i64 %stride
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -291,7 +322,7 @@ define void @byte_gep_non_power_of_two_scaled_stride(ptr noalias %p.out, ptr %p,
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -354,7 +385,7 @@ define void @byte_gep_nonscaled_stride(ptr noalias %p.out, ptr %p, i64 %stride)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -417,7 +448,7 @@ define void @byte_gep_negated_stride(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -450,14 +481,17 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-LABEL: define void @shared_stride(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -493,9 +527,24 @@ define void @shared_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD0:%.*]] = getelementptr i64, ptr [[P0]], i64 [[IDX]]
+; CHECK-NEXT: [[GEP_LD1:%.*]] = getelementptr i64, ptr [[P1]], i64 [[IDX]]
+; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[GEP_LD0]], align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP_LD1]], align 8
+; CHECK-NEXT: [[VAL:%.*]] = add i64 [[LD0]], [[LD1]]
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[VAL]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -528,15 +577,18 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-LABEL: define void @dependent_strides(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], splat (i64 1)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
@@ -577,9 +629,26 @@ define void @dependent_strides(ptr noalias %p.out, ptr %p0, ptr %p1, i64 %stride
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[STRIDE1:%.*]] = add i64 [[STRIDE]], 1
+; CHECK-NEXT: [[IDX0:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[IV]], [[STRIDE1]]
+; CHECK-NEXT: [[GEP_LD0:%.*]] = getelementptr i64, ptr [[P0]], i64 [[IDX0]]
+; CHECK-NEXT: [[GEP_LD1:%.*]] = getelementptr i64, ptr [[P1]], i64 [[IDX1]]
+; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[GEP_LD0]], align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP_LD1]], align 8
+; CHECK-NEXT: [[VAL:%.*]] = add i64 [[LD0]], [[LD1]]
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[VAL]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -614,15 +683,18 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-LABEL: define void @dependent_strides_reverse_order(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], splat (i64 1)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i64> [[VEC_IND]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
@@ -663,9 +735,26 @@ define void @dependent_strides_reverse_order(ptr noalias %p.out, ptr %p0, ptr %p
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[STRIDE0:%.*]] = add i64 [[STRIDE]], 1
+; CHECK-NEXT: [[IDX0:%.*]] = mul i64 [[IV]], [[STRIDE0]]
+; CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD0:%.*]] = getelementptr i64, ptr [[P0]], i64 [[IDX0]]
+; CHECK-NEXT: [[GEP_LD1:%.*]] = getelementptr i64, ptr [[P1]], i64 [[IDX1]]
+; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[GEP_LD0]], align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP_LD1]], align 8
+; CHECK-NEXT: [[VAL:%.*]] = add i64 [[LD0]], [[LD1]]
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[VAL]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -745,7 +834,7 @@ define void @byte_dependent_byte_geps(ptr noalias %p.out, ptr %p0, ptr %p1, i64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -828,7 +917,7 @@ define void @byte_dependent_byte_geps_reverse_order(ptr noalias %p.out, ptr %p0,
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -867,14 +956,17 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-LABEL: define void @strided_interleave(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -910,9 +1002,24 @@ define void @strided_interleave(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD0:%.*]] = getelementptr i64, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[GEP_LD1:%.*]] = getelementptr i64, ptr [[GEP_LD0]], i64 1
+; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[GEP_LD0]], align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[GEP_LD1]], align 8
+; CHECK-NEXT: [[VAL:%.*]] = add i64 [[LD0]], [[LD1]]
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[VAL]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -945,16 +1052,19 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-LABEL: define void @in_loop_base(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]], i64 [[OFFSET:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i64> [[TMP0]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
@@ -978,9 +1088,23 @@ define void @in_loop_base(ptr noalias %p.out, ptr %p, i64 %stride, i64 %offset)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IV_SDIV32:%.*]] = sdiv i64 [[IV]], 32
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[IDX:%.*]] = add i64 [[MUL]], [[OFFSET]]
+; CHECK-NEXT: [[GEP_LD:%.*]] = getelementptr i64, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1045,7 +1169,7 @@ define void @non_invariant_uniform_base(ptr noalias %p.out, ptr %p, i64 %stride)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1113,7 +1237,7 @@ define void @non_invariant_uniform_stride(ptr noalias %p.out, ptr %p, ptr %p.uni
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1153,17 +1277,20 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -1186,12 +1313,12 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_PH]] ]
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
@@ -1202,7 +1329,7 @@ define void @non_constant_btc(ptr noalias %p.out, ptr %p, i64 %stride, i64 %n) {
; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1233,17 +1360,20 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[STRIDE]], i64 1)
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -1266,12 +1396,12 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_PH]] ]
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
@@ -1282,7 +1412,7 @@ define void @stride_as_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], [[STRIDE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1314,17 +1444,20 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[N:%.*]] = add i64 [[STRIDE]], 1
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -1347,12 +1480,12 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_PH]] ]
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
@@ -1363,7 +1496,7 @@ define void @stride_dependent_btc(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1428,7 +1561,7 @@ define void @actual_stride_not_in_ir(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1462,14 +1595,17 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-LABEL: define void @nd_array_last_idx(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
@@ -1492,9 +1628,21 @@ define void @nd_array_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[HEADER]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD:%.*]] = getelementptr [256 x [256 x i64]], ptr [[P]], i64 1, i64 42, i64 [[IDX]]
+; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1554,7 +1702,7 @@ define void @nd_array_non_last_idx(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1616,7 +1764,7 @@ define void @nd_array_multiple_idxs(ptr noalias %p.out, ptr %p, i64 %stride) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1679,7 +1827,7 @@ define void @sext_stride(ptr noalias %p.out, ptr %p, i32 %stride.i32) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
@@ -1757,7 +1905,7 @@ define void @trunc_stride(ptr noalias %p.out, ptr %p, i64 %stride.i64) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -1772,7 +1920,7 @@ define void @trunc_stride(ptr noalias %p.out, ptr %p, i64 %stride.i64) {
; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i32, ptr [[P_OUT]], i32 [[IV]]
; CHECK-NEXT: store i32 [[LD]], ptr [[GEP_ST]], align 8
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[IV_NEXT]], 128
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1871,7 +2019,7 @@ define void @trunc_ext_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i32 %stride)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <4 x i16> [[VEC_IND3]], splat (i16 4)
; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -1891,7 +2039,7 @@ define void @trunc_ext_stride(ptr noalias %p.out, ptr %p0, ptr %p1, i32 %stride)
; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i32, ptr [[P_OUT]], i32 [[IV]]
; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP_ST]], align 8
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[IV_NEXT]], 128
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP40:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -1932,16 +2080,19 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-LABEL: define void @basic_masked(
; CHECK-SAME: ptr noalias [[P_OUT:%.*]], ptr [[P:%.*]], i64 [[STRIDE:%.*]], i64 [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[STRIDES_MV_CHECK:%.*]] = icmp ne i64 [[STRIDE]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br i1 [[STRIDES_MV_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH1:.*]]
+; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[STRIDE]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE8]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp sge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
@@ -1991,9 +2142,26 @@ define void @basic_masked(ptr noalias %p.out, ptr %p, i64 %stride, i64 %x) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br label %[[HEADER:.*]]
+; CHECK: [[HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp sge i64 [[IV]], [[X]]
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[LATCH]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[IDX:%.*]] = mul i64 [[IV]], [[STRIDE]]
+; CHECK-NEXT: [[GEP_LD:%.*]] = getelementptr i64, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
+; CHECK-NEXT: [[GEP_ST:%.*]] = getelementptr i64, ptr [[P_OUT]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[LD]], ptr [[GEP_ST]], align 8
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[HEADER]], label %[[EXIT]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
More information about the llvm-branch-commits
mailing list