[llvm] Draft: [LV] Outer-loop vectorization in the default vectorizer codepath (PR #128202)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 09:01:03 PST 2025
https://github.com/iamlouk created https://github.com/llvm/llvm-project/pull/128202
This is a draft PR to get feedback on whether current maintainers would consider
something like this a good-enough approach to merge into LLVM. I would split it
into smaller pieces if the general direction does not conflict with current plans.
It implements outer-loop vectorization *outside* the VPlan-native path. Minimal
LoopAccessAnalysis support for non-innermost loops was added, relying on the
`!llvm.loop.parallel_accesses` metadata.
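
For illustration (a hypothetical example, not part of this PR; the function and
parameter names are mine): with clang, `#pragma clang loop vectorize(assume_safety)`
is one way to get this metadata, since it marks the loop's memory accesses with
`!llvm.access.group` and the loop itself with `!llvm.loop.parallel_accesses`:

```c++
#include <cstddef>

// Hypothetical loop nest whose *outer* loop ends up annotated with
// !llvm.loop.parallel_accesses: assume_safety asserts that iterations of
// the annotated loop are independent, so LAA needs no memory dependency
// checks for it.
void add_rows(float *A, const float *B, std::size_t N, std::size_t M) {
#pragma clang loop vectorize(assume_safety)
  for (std::size_t i = 0; i < N; i++)    // outer loop: annotated parallel
    for (std::size_t j = 0; j < M; j++)  // inner loop
      A[i * M + j] += B[j];
}
```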
Unlike the VPlan-native path, this supports inner loops with non-invariant
trip counts or non-uniform inductions (see the sketch below), and the quality of
the emitted code is better than what the current VPlan-native path produces.
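
As a sketch of what a non-invariant inner trip count looks like (my own
illustration, not taken from this PR's tests):

```c++
#include <cstddef>

// Hypothetical triangular loop nest: the inner trip count depends on the
// outer induction variable i, so it is not invariant to the outer loop
// being vectorized. Handling it requires predicating the inner loop with a
// per-lane active lane mask, which the VPlan-native path does not support.
void prefix_sums(float *A, const float *B, std::size_t N) {
  for (std::size_t i = 0; i < N; i++) {  // outer loop: to be vectorized
    float sum = 0.0f;
    for (std::size_t j = 0; j <= i; j++) // inner trip count varies with i
      sum += B[j];
    A[i] = sum;
  }
}
```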
An implementation very close to this one (#124432 required some changes, but
also simplified this PR a lot) was successfully tested on the llvm-test-suite
and SPEC, in combination with basic LAA MemoryDepChecker support for outer
loops (not part of this PR); outer-loop vectorization was forced for ~3000 loops.
As a real-world motivating example, consider [this loop](https://github.com/HydroBench/Hydro/blob/6fa22ca83df6b355abf1eba42a9de6a24346b48e/HydroC/HydroCplusMPI/Tile.cpp#L1095):
outer-loop vectorizing it more than doubles its performance.
Some of the VPWidenPHIRecipe code is duplicated from #128187.
From 72dccc1b5ded92345de8b63048f2995c223b29dc Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 20:56:40 +0100
Subject: [PATCH 1/5] [VPlan] Update entry/exiting blocks in VPRegionBlocks
---
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 6ddb88308955f..fd197fc8add2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -96,6 +96,9 @@ class VPBlockUtils {
connectBlocks(NewBlock, Succ);
}
connectBlocks(BlockPtr, NewBlock);
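+    // If BlockPtr was the exiting block of its enclosing region, the block
+    // inserted after it becomes the region's new exiting block.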
+ VPRegionBlock *Parent = BlockPtr->getParent();
+ if (Parent && Parent->getExiting() == BlockPtr)
+ Parent->setExiting(NewBlock);
}
/// Insert disconnected block \p NewBlock before \p Blockptr. First
@@ -112,6 +115,9 @@ class VPBlockUtils {
connectBlocks(Pred, NewBlock);
}
connectBlocks(NewBlock, BlockPtr);
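+    // Likewise, a block inserted before the region's entry becomes the
+    // region's new entry block.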
+ VPRegionBlock *Parent = BlockPtr->getParent();
+ if (Parent && Parent->getEntry() == BlockPtr)
+ Parent->setEntry(NewBlock);
}
/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
From 3819995b4647718b3b66ff941d40e9a3184f6bef Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:03:45 +0100
Subject: [PATCH 2/5] [VPlan] Cloning and unrolling for VPWidenPHIRecipe
---
llvm/lib/Transforms/Vectorize/VPlan.h | 6 +++-
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 31 +++++++++++++++++++
2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..15e90bc18bc87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1974,7 +1974,11 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
}
VPWidenPHIRecipe *clone() override {
- llvm_unreachable("cloning not implemented yet");
+ auto *Phi = new VPWidenPHIRecipe(
+ dyn_cast_if_present<PHINode>(getUnderlyingValue()));
+ for (unsigned I = 0; I < getNumOperands(); I++)
+ Phi->addOperand(getIncomingValue(I));
+ return Phi;
}
~VPWidenPHIRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 89e372d6b46cf..0b46e043e873d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -384,6 +384,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
+ // Handle inner-loop/region header phis. The backedge values will be set
+  // later. Phis not in a loop header can be unrolled like any other recipe;
+  // the RPO traversal makes sure their predecessors are all visited first.
+ VPRegionBlock *Region = R.getParent()->getParent();
+ if (auto *P = dyn_cast<VPWidenPHIRecipe>(&R);
+ P && Region->getEntryBasicBlock() == P->getParent()) {
+ auto InsertPt = std::next(R.getIterator());
+ for (unsigned Part = 1; Part != UF; ++Part) {
+ VPWidenPHIRecipe *Copy = P->clone();
+ Copy->insertBefore(*R.getParent(), InsertPt);
+ addRecipeForPart(&R, Copy, Part);
+ }
+ continue;
+ }
+
unrollRecipeByUF(R);
}
}
@@ -442,5 +457,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
Part++;
}
+ // Remap operands of cloned inner-loop header phis to update backedge values,
+ // a problem unique to outer-loop vectorization.
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+ DeepRPOT(Plan.getEntry());
+ for (VPRegionBlock *Region :
+ VPBlockUtils::blocksOnly<VPRegionBlock>(DeepRPOT))
+ for (VPRecipeBase &R : Region->getEntryBasicBlock()->phis())
+ if (auto *Phi = dyn_cast<VPWidenPHIRecipe>(&R)) {
+ if (Unroller.contains(Phi->getVPSingleValue())) {
+ Part = 1;
+ continue;
+ }
+ Unroller.remapOperands(&R, Part);
+ Part++;
+ }
+
VPlanTransforms::removeDeadRecipes(Plan);
}
From 407d320aba89140c06e59f326847c5b6854a3359 Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:04:35 +0100
Subject: [PATCH 3/5] [VPlan] Unrolling of VPInstruction::AnyOf
---
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 0b46e043e873d..2360a20d78cd5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -373,6 +373,28 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
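+    // Unroll AnyOf by creating one AnyOf per part and OR-ing the per-part
+    // results; users of the original AnyOf are redirected to the last OR in
+    // the chain.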
+ if (auto *Any = dyn_cast<VPInstruction>(&R);
+ Any && Any->getOpcode() == VPInstruction::AnyOf) {
+ VPValue *Res = Any;
+ VPRecipeBase *FirstOr = nullptr;
+ for (unsigned Part = 1; Part != UF; ++Part) {
+ auto *NewAny = new VPInstruction(
+ VPInstruction::AnyOf, {getValueForPart(Any->getOperand(0), Part)},
+ Any->getDebugLoc());
+ NewAny->insertAfter(Res->getDefiningRecipe());
+ auto *Or = new VPInstruction(Instruction::Or, {Res, NewAny},
+ Any->getDebugLoc());
+ Or->insertAfter(NewAny->getDefiningRecipe());
+ ToSkip.insert(Or);
+ if (Part == 1)
+ FirstOr = Or;
+ Res = Or;
+ }
+ Any->getVPSingleValue()->replaceAllUsesWith(Res);
+ FirstOr->setOperand(0, Any);
+ continue;
+ }
+
auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
addUniformForAllParts(SingleDef);
From 66556d57feadce2782b3498c52171d8fa564c48a Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:06:00 +0100
Subject: [PATCH 4/5] [LAA] Basic initial outer-loop support
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 69 ++++++++--
.../LoopAccessAnalysis/outer-loops.ll | 128 ++++++++++++++++++
2 files changed, 186 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a1d91de3bb788..6fe7a8a9eed69 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -792,21 +792,65 @@ class AccessAnalysis {
} // end anonymous namespace
+/// Return true if \p E is invariant with regard to the loop \p L.
+/// If \p E is a recurrence around an inner loop of \p L, then the
+/// start and step of that inner-loop recurrence must be invariant
+/// to \p L.
+static bool isInvariantToTheLoop(const Loop *L, ScalarEvolution &SE,
+ const SCEV *E) {
+ if (SE.isLoopInvariant(E, L))
+ return true;
+
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(E);
+ AddRec && L != AddRec->getLoop() && L->contains(AddRec->getLoop())) {
+ for (auto *Op : AddRec->operands())
+ if (!isInvariantToTheLoop(L, SE, Op))
+ return false;
+
+ return true;
+ }
+
+ return false;
+}
+
/// Try to compute a constant stride for \p AR. Used by getPtrStride and
/// isNoWrap.
static std::optional<int64_t>
getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
Value *Ptr, PredicatedScalarEvolution &PSE) {
- // The access function must stride over the innermost loop.
+ // The access function must stride over the queried loop.
if (Lp != AR->getLoop()) {
- LLVM_DEBUG({
- dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
- if (Ptr)
- dbgs() << *Ptr << " ";
+ assert(!Lp->isInnermost() && Lp->contains(AR->getLoop()) &&
+ "Classic SE should have detected invariance");
+ while (AR && Lp != AR->getLoop()) {
+ if (isInvariantToTheLoop(Lp, *PSE.getSE(), AR))
+ return {0};
+
+ const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+ if (!isInvariantToTheLoop(Lp, *PSE.getSE(), Step)) {
+ LLVM_DEBUG({
+ dbgs() << "LAA: Bad stride - Depends on inner loop ";
+ if (Ptr)
+ dbgs() << *Ptr << " ";
+
+ dbgs() << "SCEV: " << *AR << "\n";
+ });
+ return std::nullopt;
+ }
- dbgs() << "SCEV: " << *AR << "\n";
- });
- return std::nullopt;
+ AR = dyn_cast<SCEVAddRecExpr>(AR->getStart());
+ }
+
+ if (!AR || Lp != AR->getLoop()) {
+ LLVM_DEBUG({
+ dbgs() << "LAA: Bad stride - Strides over inner loop ";
+ if (Ptr)
+ dbgs() << *Ptr << " ";
+
+ dbgs() << "SCEV: " << *AR << "\n";
+ });
+ return std::nullopt;
+ }
}
// Check the step is constant.
@@ -2365,8 +2409,9 @@ bool LoopAccessInfo::canAnalyzeLoop() {
<< TheLoop->getHeader()->getParent()->getName() << "' from "
<< TheLoop->getLocStr() << "\n");
- // We can only analyze innermost loops.
- if (!TheLoop->isInnermost()) {
+  // Non-innermost loops can only be analyzed if no memory dependency checks
+  // are needed, i.e. if they are annotated as parallel.
+ if (!TheLoop->isInnermost() && !TheLoop->isAnnotatedParallel()) {
LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
return false;
@@ -2587,6 +2632,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
return true;
}
+ assert(TheLoop->isInnermost());
+
for (LoadInst *LD : Loads) {
Value *Ptr = LD->getPointerOperand();
// If we did *not* see this pointer before, insert it to the
@@ -2812,7 +2859,7 @@ bool LoopAccessInfo::isInvariant(Value *V) const {
if (!SE->isSCEVable(V->getType()))
return false;
const SCEV *S = SE->getSCEV(V);
- return SE->isLoopInvariant(S, TheLoop);
+ return isInvariantToTheLoop(TheLoop, *SE, S);
}
/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
new file mode 100644
index 0000000000000..c71d821a7b0b6
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; The inner two loops of a naive matrix multiplication.
+; Not annotated as parallel, so the outer loop should not be analyzed.
+define void @outer_loop_not_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) {
+; CHECK-LABEL: 'outer_loop_not_parallel'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: loop.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ %M.is.zero = icmp eq i64 %M, 0
+ br i1 %M.is.zero, label %loop.latch, label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b = load float, ptr %b.addr, align 4
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c = load float, ptr %c.addr, align 4
+ %mul = fmul float %b, %c
+ %a.next = fadd float %a, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %M
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+
+; The inner two loops of a naive matrix multiplication.
+; The outer loop is annotated as parallel.
+define void @outer_loop_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) {
+; CHECK-LABEL: 'outer_loop_parallel'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: loop.header:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ %M.is.zero = icmp eq i64 %M, 0
+ br i1 %M.is.zero, label %loop.latch, label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b = load float, ptr %b.addr, align 4, !llvm.access.group !1
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c = load float, ptr %c.addr, align 4, !llvm.access.group !1
+ %mul = fmul float %b, %c
+ %a.next = fadd float %a, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %M
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !1
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
+!1 = distinct !{}
From 78a89034e061cf16ba22e478ee3edeeb09b55362 Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:38:40 +0100
Subject: [PATCH 5/5] [LV] Outer-loop vectorization in the default vectorizer
codepath
---
.../Vectorize/LoopVectorizationLegality.h | 4 +
.../Vectorize/LoopVectorizationLegality.cpp | 66 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 275 +++++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 20 +-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 8 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 40 +-
.../outer-loop-vect-in-classic-path.ll | 831 ++++++++++++++++++
.../outer-loop-vect-in-classic-path.ll | 647 ++++++++++++++
9 files changed, 1843 insertions(+), 55 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index e959d93b57275..871a79d081719 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -642,6 +642,10 @@ class LoopVectorizationLegality {
/// Keep track of the loop edge to an uncountable exit, comprising a pair
/// of (Exiting, Exit) blocks, if there is exactly one early exit.
std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
+
+ /// Contains true for a nested loop if it or any of its parents up
+  /// to the loop to vectorize needs an inner-loop active lane mask.
+ mutable DenseMap<const Loop *, bool> InnerLoopsNeedingPredication;
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 420cbc5384ce4..1b107179ba4ee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -572,6 +572,11 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
if (VF.isScalar())
return true;
+ // The SCEVAddRecForUniformityRewriter does not support accesses to addresses
+  // that are invariant w.r.t. the vectorized loop but contain recurrences of
+  // inner loops.
+ if (!TheLoop->isInnermost())
+ return false;
+
// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
// never considered uniform.
auto *SE = PSE.getSE();
@@ -1207,8 +1212,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
});
}
- if (!LAI->canVectorizeMemory())
- return canVectorizeIndirectUnsafeDependences();
+ if (!LAI->canVectorizeMemory()) {
+ if (canVectorizeIndirectUnsafeDependences())
+ return true;
+
+ return false;
+ }
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1403,7 +1412,31 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
"Uncountable exiting block must be a direct predecessor of latch");
return BB == Latch;
}
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+
+ if (LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT))
+ return true;
+
+ // Blocks in inner loops need predication if the inner loop trip-count
+ // is not invariant to the vectorized loop.
+ if (!TheLoop->isInnermost()) {
+ Loop *BBLoop = LI->getLoopFor(BB);
+ if (BBLoop != TheLoop) {
+ if (auto Iter = InnerLoopsNeedingPredication.find(BBLoop);
+ Iter != InnerLoopsNeedingPredication.end())
+ return Iter->second;
+
+ for (Loop *L = BBLoop; L != TheLoop; L = L->getParentLoop())
+ if (!isUniformLoop(L, TheLoop)) {
+ InnerLoopsNeedingPredication[BBLoop] = true;
+ return true;
+ }
+
+ InnerLoopsNeedingPredication[BBLoop] = false;
+ return false;
+ }
+ }
+
+ return false;
}
bool LoopVectorizationLegality::blockCanBePredicated(
@@ -1537,9 +1570,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {
- assert((UseVPlanNativePath || Lp->isInnermost()) &&
- "VPlan-native path is not enabled.");
-
// TODO: ORE should be improved to show more accurate information when an
// outer loop can't be vectorized because a nested loop is not understood or
// legal. Something like: "outer_loop_location: loop not vectorized:
@@ -1573,6 +1603,23 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
return false;
}
+ if (Lp != TheLoop && !UseVPlanNativePath) {
+ // Inner loops must be in loop-simplify form with the latch block being
+ // also the only exiting block and a dedicated exit.
+ BasicBlock *Exiting = Lp->getExitingBlock();
+ if (!Lp->isLoopSimplifyForm() || !Exiting ||
+ Exiting != Lp->getLoopLatch() || !Lp->isLCSSAForm(*DT)) {
+ reportVectorizationFailure(
+ "The inner loops must exit through their latch",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
return Result;
}
@@ -1775,9 +1822,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
// Specific checks for outer loops. We skip the remaining legal checks at this
// point because they don't support outer loops.
- if (!TheLoop->isInnermost()) {
- assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
-
+ if (!TheLoop->isInnermost() && UseVPlanNativePath) {
if (!canVectorizeOuterLoop()) {
reportVectorizationFailure("Unsupported outer loop",
"UnsupportedOuterLoop", ORE, TheLoop);
@@ -1790,7 +1835,6 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return Result;
}
- assert(TheLoop->isInnermost() && "Inner loop expected.");
// Check if we can if-convert non-single-bb loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
@@ -1811,7 +1855,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
}
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
- if (TheLoop->getExitingBlock()) {
+ if (TheLoop->getExitingBlock() || !TheLoop->isInnermost()) {
reportVectorizationFailure("Cannot vectorize uncountable loop",
"UnsupportedUncountableLoop", ORE, TheLoop);
if (DoExtraAnalysis)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8a5db28ea0a4..555135a73ce28 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,6 +58,7 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
+#include "VPlanDominatorTree.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
@@ -401,6 +402,11 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool> ExperimentalOLVInClassicPath(
+ "experimental-olv-in-classic-vect", cl::init(false), cl::Hidden,
+ cl::desc("Enable experimental outer-loop vectorization outside the "
+ "VPlan-native path."));
+
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1085,9 +1091,8 @@ class LoopVectorizationCostModel {
assert(VF.isVector() &&
"Profitable to scalarize relevant only for VF > 1.");
assert(
- TheLoop->isInnermost() &&
+ (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
"cost-model should not be used for outer loops (in VPlan-native path)");
-
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
@@ -1097,7 +1102,7 @@ class LoopVectorizationCostModel {
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
assert(
- TheLoop->isInnermost() &&
+ (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
"cost-model should not be used for outer loops (in VPlan-native path)");
// Pseudo probe needs to be duplicated for each unrolled iteration and
// vector lane so that profiled loop trip count can be accurately
@@ -1117,7 +1122,7 @@ class LoopVectorizationCostModel {
/// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
assert(
- TheLoop->isInnermost() &&
+ (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
"cost-model should not be used for outer loops (in VPlan-native path)");
if (VF.isScalar())
return true;
@@ -1190,7 +1195,7 @@ class LoopVectorizationCostModel {
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
assert(VF.isVector() && "Expected VF to be a vector VF");
assert(
- TheLoop->isInnermost() &&
+ (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
"cost-model should not be used for outer loops (in VPlan-native path)");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
@@ -2205,7 +2210,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
return false;
}
- if (Hints.getInterleave() > 1) {
+ if (Hints.getInterleave() > 1 && EnableVPlanNativePath) {
// TODO: Interleave support is future work.
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n");
@@ -2224,7 +2229,8 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
if (L.isInnermost() || VPlanBuildStressTest ||
- (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ ((EnableVPlanNativePath || ExperimentalOLVInClassicPath) &&
+ isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
RPOT.perform(LI);
if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
@@ -2932,7 +2938,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
- if (EnableVPlanNativePath)
+ if (EnableVPlanNativePath || ExperimentalOLVInClassicPath)
fixNonInductionPHIs(State);
// After vectorization, the exit blocks of the original loop will have
@@ -3675,6 +3681,31 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
HasUniformUse.insert(Ptr);
}
+ if (!TheLoop->isInnermost()) {
+ SmallVector<Loop *> Loops(ArrayRef(TheLoop->getSubLoops()));
+ while (!Loops.empty()) {
+ auto *Lp = Loops.pop_back_val();
+ // Inner-loop inductions can be uniform, as well as their backedge value.
+ for (PHINode &Phi : Lp->getHeader()->phis())
+ if (Legal->isInvariant(&Phi)) {
+ AddToWorklistIfAllowed(&Phi);
+ auto *BackedgeVal = Phi.getIncomingValueForBlock(Lp->getLoopLatch());
+ assert(Legal->isInvariant(BackedgeVal));
+ if (auto *I = dyn_cast<Instruction>(BackedgeVal))
+ AddToWorklistIfAllowed(I);
+ }
+
+      // The exit condition of an inner loop can be uniform.
+ auto *Br = cast<BranchInst>(Lp->getLoopLatch()->getTerminator());
+ auto *ICmp = dyn_cast<ICmpInst>(Br->getCondition());
+ if (ICmp && Legal->isInvariant(ICmp->getOperand(0)) &&
+ Legal->isInvariant(ICmp->getOperand(1)))
+ AddToWorklistIfAllowed(ICmp);
+
+ Loops.append(Lp->getSubLoops().begin(), Lp->getSubLoops().end());
+ }
+ }
+
// Add to the worklist any operands which have *only* uniform (e.g. lane 0
// demanding) users. Since loops are assumed to be in LCSSA form, this
// disallows uses outside the loop as well.
@@ -6408,14 +6439,23 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
if (!Legal->isInvariant(Op))
return false;
+
// Consider Op invariant, if it or its operands aren't predicated
// instruction in the loop. In that case, it is not trivially hoistable.
auto *OpI = dyn_cast<Instruction>(Op);
- return !OpI || !TheLoop->contains(OpI) ||
- (!isPredicatedInst(OpI) &&
- (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
- all_of(OpI->operands(),
- [this](Value *Op) { return shouldConsiderInvariant(Op); }));
+ if (!OpI || !TheLoop->contains(OpI))
+ return true;
+
+ // Be pessimistic in case of inner loops and do not assume things are
+  // invariant. The approach below results in an endless loop in case an
+  // inner-loop header PHI is part of the operands.
+ if (!TheLoop->isInnermost())
+ return false;
+
+ return !isPredicatedInst(OpI) &&
+ (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
+ all_of(OpI->operands(),
+ [this](Value *Op) { return shouldConsiderInvariant(Op); });
}
InstructionCost
@@ -7134,7 +7174,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
}
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
- assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
+ "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -7577,6 +7618,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
BestPlan.getVectorLoopRegion()->getSingleSuccessor() !=
BestPlan.getMiddleBlock();
assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop ||
+ ExperimentalOLVInClassicPath ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
@@ -8265,7 +8307,7 @@ void VPRecipeBuilder::createHeaderMask() {
BlockMaskCache[Header] = BlockMask;
}
-VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+VPValue *VPRecipeBuilder::getBlockInMask(const BasicBlock *BB) const {
// Return the cached value.
BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
assert(BCEntryIt != BlockMaskCache.end() &&
@@ -8986,7 +9028,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
- assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
+ "Inner loop expected.");
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -9295,6 +9338,141 @@ static void addExitUsersForFirstOrderRecurrences(
}
}
+// Called before visiting the first instruction in the entry block
+// of the inner-loop region.
+static void enterInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder,
+ VPRecipeBuilder &RecipeBuilder,
+ VPRegionBlock &Region, ScalarEvolution &SE,
+ const Loop *TheLoop, const LoopInfo &LI) {
+ VPBasicBlock *Entry = Region.getEntryBasicBlock();
+ const Loop *InnerLoop = LI.getLoopFor(HCFGBuilder.getIRBBForVPB(Entry));
+ assert(InnerLoop->isLoopSimplifyForm() && InnerLoop->getNumBackEdges() == 1 &&
+ InnerLoop->getExitingBlock());
+
+ // Handle the inner-loop header phis.
+ const BasicBlock *IRPreheader = InnerLoop->getLoopPreheader();
+ for (VPRecipeBase &R : Entry->phis()) {
+ // TODO: If the phi has only uniform users (can happen for inner-loop
+ // inductions), then creating a scalar phi instead would be
+ // beneficial, or even a scalar and a widened phi in case the inner-loop
+ // induction has uniform and non-uniform users.
+ auto *Phi = cast<VPWidenPHIRecipe>(&R);
+ auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+ Phi->setOperand(0, RecipeBuilder.getVPValueOrAddLiveIn(
+ IRPhi->getIncomingValueForBlock(IRPreheader)));
+
+ // This will ensure that this instruction is kept and not replaced when
+ // the entry block instructions are visited.
+ RecipeBuilder.setRecipe(IRPhi, Phi);
+ }
+
+ // Handle predication for the inner loop.
+ VPValue *PreheaderMask = RecipeBuilder.getBlockInMask(IRPreheader);
+ const SCEV *BTC = SE.getBackedgeTakenCount(InnerLoop);
+ bool NeedsActiveLaneMask =
+ !isa<SCEVCouldNotCompute>(BTC) && SE.isLoopInvariant(BTC, TheLoop);
+ if (NeedsActiveLaneMask) {
+ auto *InnerALM = new VPWidenPHIRecipe(nullptr);
+ if (!PreheaderMask)
+ PreheaderMask = Region.getPlan()->getOrAddLiveIn(
+ ConstantInt::getTrue(SE.getContext()));
+ // The backedge value will be filled in when the exit block of the
+    // region is visited.
+ InnerALM->addOperand(PreheaderMask);
+ InnerALM->insertBefore(*Entry, Entry->getFirstNonPhi());
+ RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), InnerALM);
+ } else {
+ RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), PreheaderMask);
+ }
+}
+
+// Called after the exiting block of the region is visited before
+// visiting the exit block.
+static void exitInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder,
+ VPRecipeBuilder &RecipeBuilder,
+ VPRegionBlock &Region) {
+
+ auto *Entry = Region.getEntryBasicBlock();
+ auto *Exiting = Region.getExitingBasicBlock();
+ const auto *IRHeader = HCFGBuilder.getIRBBForVPB(Entry);
+ const auto *IRBr =
+ cast<BranchInst>(HCFGBuilder.getIRBBForVPB(Exiting)->getTerminator());
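+  // The IR loop exits when the branch condition is true iff the branch's
+  // false successor is the loop header.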
+ bool ExitIfTrue = IRBr->getSuccessor(1) == IRHeader;
+
+ // Create the inner-loop exit condition and the backedge value for the
+ // inner-loop active-lane mask (if needed).
+ VPValue *ExitCond = RecipeBuilder.getVPValueOrAddLiveIn(IRBr->getCondition());
+ auto *ALM = dyn_cast_or_null<VPWidenPHIRecipe>(
+ RecipeBuilder.getBlockInMask(IRHeader));
+ VPBuilder Builder(Exiting, Exiting->end());
+ DebugLoc DL = IRBr->getDebugLoc();
+ if (ALM && ALM->getParent() == Entry) {
+ assert(!ALM->getUnderlyingValue() && ALM->getNumOperands() == 1);
+ if (ExitIfTrue)
+ ExitCond = Builder.createNot(ExitCond, DL);
+
+ auto *ALMBackedgeVal = Builder.createLogicalAnd(ALM, ExitCond, DL);
+ ALM->addOperand(ALMBackedgeVal);
+ auto *Any =
+ Builder.createNaryOp(VPInstruction::AnyOf, {ALMBackedgeVal}, DL);
+ ExitCond = Builder.createNot(Any, DL);
+ } else if (!ExitIfTrue) {
+ ExitCond = Builder.createNot(ExitCond, DL);
+ }
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {ExitCond}, DL);
+
+ // Set the backedge values of the inner-loop header phis.
+ const auto *IRPreheader =
+ HCFGBuilder.getIRBBForVPB(Region.getSinglePredecessor());
+ for (VPRecipeBase &R : Entry->phis()) {
+ auto *Phi = cast<VPWidenPHIRecipe>(&R);
+ if (Phi == ALM)
+ continue;
+
+ auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+ Phi->setOperand(1, RecipeBuilder.getVPValueOrAddLiveIn(
+ IRPhi->getIncomingValueForBlock(IRBr->getParent())));
+ }
+
+ // Handle the LCSSA phis for inner-loop live-out values.
+ auto *ExitBlock = cast<VPBasicBlock>(Region.getSingleSuccessor());
+ for (VPRecipeBase &R : ExitBlock->phis()) {
+ auto *Phi = cast<VPWidenPHIRecipe>(&R);
+ auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+ assert(Phi->getNumOperands() == 1);
+ RecipeBuilder.setRecipe(IRPhi, Phi);
+ VPValue *OutVal =
+ RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValue(0));
+ VPRecipeBase *OutValDef = OutVal->getDefiningRecipe();
+ if (OutValDef && OutValDef->getParent()->getParent() == &Region && ALM &&
+ ALM->getParent() == Entry) {
+      // In case there is an inner-loop active-lane mask, the live-out value of
+ // the inner loop for a vector must contain the values of the last
+ // iteration where that lane was active. For this, a new phi is created
+ // that passes through the value from the last iteration if the lane is
+ // inactive and the current one if not.
+ auto *PassthroughPhi = new VPWidenPHIRecipe(IRPhi);
+ PassthroughPhi->addOperand(
+ Region.getPlan()->getOrAddLiveIn(PoisonValue::get(IRPhi->getType())));
+ PassthroughPhi->insertBefore(*Entry, Entry->getFirstNonPhi());
+
+ auto *Select =
+ new VPInstruction(Instruction::Select, {ALM, OutVal, PassthroughPhi},
+ OutValDef->getDebugLoc());
+ Select->insertAfter(OutValDef);
+
+ PassthroughPhi->addOperand(Select);
+ OutVal = Select;
+ }
+
+ Phi->setOperand(0, OutVal);
+ }
+
+ // The mask of the exit block should be that of the preheader.
+ RecipeBuilder.setBlockInMask(HCFGBuilder.getIRBBForVPB(ExitBlock),
+ RecipeBuilder.getBlockInMask(IRPreheader));
+}
+
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
@@ -9378,9 +9556,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+
BasicBlock *HeaderBB = OrigLoop->getHeader();
bool NeedsMasks =
- CM.foldTailByMasking() ||
+ CM.foldTailByMasking() || !OrigLoop->isInnermost() ||
any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
return Legal->blockNeedsPredication(BB) || NeedsBlends;
@@ -9392,12 +9571,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
- ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
VPBlockBase *PrevVPBB = nullptr;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPBlockBase *VPBlock : RPOT) {
+ // Handle the entering into a new inner loop.
+ if (auto *Region = dyn_cast<VPRegionBlock>(VPBlock)) {
+ assert(ExperimentalOLVInClassicPath);
+ enterInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region, *PSE.getSE(),
+ OrigLoop, *LI);
+
+ // The inner-loop region can keep its successor connection and should be
+ // connected to its RPO predecessor, but when visiting the entry block of
+ // the inner loop, there should be no connection to the RPO predecessor.
+ assert(Region->getNumSuccessors() == 1 && PrevVPBB &&
+ "Invalid inner loop (expected preheader and dedicated exit)");
+ VPBlockUtils::connectBlocks(PrevVPBB, Region);
+ PrevVPBB = nullptr;
+ continue;
+ }
+
+ VPBasicBlock *VPBB = cast<VPBasicBlock>(VPBlock);
+
// Handle VPBBs down to the latch.
if (VPBB == LoopRegion->getExiting()) {
assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
@@ -9409,7 +9606,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Create mask based on the IR BB corresponding to VPBB.
// TODO: Predicate directly based on VPlan.
Builder.setInsertPoint(VPBB, VPBB->begin());
- if (VPBB == HeaderVPBB) {
+ if (RecipeBuilder.hasBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB))) {
+ Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+ } else if (VPBB == HeaderVPBB) {
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
RecipeBuilder.createHeaderMask();
} else if (NeedsMasks) {
@@ -9429,7 +9628,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
- (isa<VPInstruction>(&R) && !UnderlyingValue))
+ (isa<VPInstruction>(&R) && !UnderlyingValue) ||
+ (isa<VPWidenPHIRecipe>(&R) &&
+ (!UnderlyingValue ||
+ RecipeBuilder.hasRecipe(cast<Instruction>(UnderlyingValue)))))
continue;
// FIXME: VPlan0, which models a copy of the original scalar loop, should
@@ -9451,6 +9653,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Builder.setInsertPoint(SingleDef);
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
+ if (Phi && RecipeBuilder.hasRecipe(Phi))
+ // Skip over LCSSA or inner-loop header phis.
+ continue;
+
if (Phi && Phi->getParent() == HeaderBB) {
// The backedge value will be added in fixHeaderPhis later.
Operands.push_back(Plan->getOrAddLiveIn(
@@ -9498,6 +9704,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
R.eraseFromParent();
}
+  // Handle the exit of an inner-loop region.
+ if (auto *Region = VPBB->getParent();
+ Region && Region->getExiting() == VPBB) {
+ exitInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region);
+
+ if (PrevVPBB)
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+
+ // The region will already be connected to its single successor.
+ assert(Region->getNumSuccessors() == 1 && VPBB->getNumSuccessors() == 0);
+ PrevVPBB = nullptr;
+ continue;
+ }
+
// Flatten the CFG in the loop. Masks for blocks have already been generated
// and added to recipes as needed. To do so, first disconnect VPBB from its
// successors. Then connect VPBB to the previously visited VPBB.
@@ -10460,9 +10680,6 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
}
bool LoopVectorizePass::processLoop(Loop *L) {
- assert((EnableVPlanNativePath || L->isInnermost()) &&
- "VPlan-native path is not enabled. Only process inner loops.");
-
LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
<< L->getHeader()->getParent()->getName() << "' from "
<< L->getLocStr() << "\n");
@@ -10520,11 +10737,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
- if (!L->isInnermost())
+ //
+ // The normal vectorization codepath now also has experimental support for
+ // outer-loop vectorization.
+ if (!L->isInnermost() && EnableVPlanNativePath)
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, BFI, PSI, Hints, Requirements);
- assert(L->isInnermost() && "Inner loop expected.");
+ assert((L->isInnermost() || ExperimentalOLVInClassicPath) &&
+ "Inner loop expected.");
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
@@ -10534,7 +10755,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
UseInterleaved = EnableInterleavedMemAccesses;
// Analyze interleaved memory accesses.
- if (UseInterleaved)
+ if (UseInterleaved && L->isInnermost())
IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
if (LVL.hasUncountableEarlyExit()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e8d3ad89e14cf..464f43927f780 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -196,7 +196,17 @@ class VPRecipeBuilder {
void createBlockInMask(BasicBlock *BB);
/// Returns the *entry* mask for the block \p BB.
- VPValue *getBlockInMask(BasicBlock *BB) const;
+ VPValue *getBlockInMask(const BasicBlock *BB) const;
+
+ /// Returns true if there already is a block-in mask for \p BB.
+ bool hasBlockInMask(BasicBlock *BB) const {
+ return BlockMaskCache.contains(BB);
+ }
+
+ /// Set the block-in mask of \p BB directly.
+ void setBlockInMask(BasicBlock *BB, VPValue *Mask) {
+ BlockMaskCache[BB] = Mask;
+ }
/// Create an edge mask for every destination of cases and/or default.
void createSwitchEdgeMasks(SwitchInst *SI);
@@ -225,6 +235,14 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range);
+ /// Return true if there already is a recipe for the given ingredient.
+ bool hasRecipe(Instruction *I) const { return Ingredient2Recipe.contains(I); }
+
+  /// Build a VPReplicateRecipe for \p I. If it is predicated, add the mask as
+ /// last operand. Range.End may be decreased to ensure same recipe behavior
+ /// from \p Range.Start to \p Range.End.
+ VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range);
+
/// Add the incoming values from the backedge to reduction & first-order
/// recurrence cross-iteration phis.
void fixHeaderPhis();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index cd111365c134c..ac8823df0c2f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -970,7 +970,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
- assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
+ // When outer-loop vectorizing and the trip-count is known, it is possible
+ // that VPlanTransforms::optimizeForVFAndUF() destroys the vector loop region,
+  // but getVectorLoopRegion() will then incorrectly return the inner-loop region.
+ assert((!getVectorLoopRegion() || VFxUF.getNumUsers() ||
+ !State.LI->getLoopFor(getScalarHeader()->getIRBasicBlock())
+ ->isInnermost()) &&
"VFxUF expected to always have users");
unsigned UF = getUF();
if (VF.getNumUsers()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 15e90bc18bc87..f0786d3d9e529 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1994,11 +1994,17 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
VPSlotTracker &SlotTracker) const override;
#endif
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
/// Returns the \p I th incoming VPBasicBlock.
VPBasicBlock *getIncomingBlock(unsigned I);
/// Returns the \p I th incoming VPValue.
- VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
+ VPValue *getIncomingValue(unsigned I) const { return getOperand(I); }
+
+ /// Return the incoming VPValue for the predecessor \p BB.
+ VPValue *getIncomingValueForBlock(const VPBasicBlock *BB) const;
};
/// A recipe for handling first-order recurrence phis. The start value is the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..42a918e8c76d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3642,10 +3642,25 @@ VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) {
return Pred->getExitingBasicBlock();
}
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
- assert(EnableVPlanNativePath &&
- "Non-native vplans are not expected to have VPWidenPHIRecipes.");
+VPValue *
+VPWidenPHIRecipe::getIncomingValueForBlock(const VPBasicBlock *BB) const {
+ const VPBasicBlock *Parent = getParent();
+ const VPRegionBlock *Region = Parent->getParent();
+ if (Region && Region->getEntryBasicBlock() == Parent) {
+ if (Region->getSinglePredecessor() == BB)
+ return getOperand(0);
+ if (Region->getExitingBasicBlock() == BB)
+ return getOperand(1);
+ }
+
+ for (unsigned I = 0; I < Parent->getNumPredecessors(); I++)
+ if (Parent->getPredecessors()[I] == BB)
+ return getOperand(I);
+ return nullptr;
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
@@ -3657,23 +3672,20 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-PHI ";
-
- auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
- // Unless all incoming values are modeled in VPlan print the original PHI
- // directly.
- // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
- // values as VPValues.
- if (getNumOperands() != OriginalPhi->getNumOperands()) {
- O << VPlanIngredient(OriginalPhi);
- return;
- }
-
printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
}
#endif
+InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
+ if (getNumOperands() == 1)
+ return 0; // LCSSA Phis can be considered free.
+
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+}
+
// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
new file mode 100644
index 0000000000000..bed6c3ece93a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
@@ -0,0 +1,831 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \
+; RUN: -force-vector-interleave=1 -experimental-olv-in-classic-vect \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC1 %s
+; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \
+; RUN: -force-vector-interleave=2 -experimental-olv-in-classic-vect \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC2 %s
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+; #pragma clang loop vectorize(enable)
+; for (size_t j = 0; j < N; j++) {
+; float a = 0.;
+;     for (size_t k = 0; k < M; k++)
+; a += B[i][k] * C[k][j];
+; A[i][j] = a;
+; }
+; }
+define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-IC1-LABEL: define void @foo(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH9]] ]
+; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1: [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP14:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[TMP5:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP7]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]]
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT: [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI2]], [[TMP12]]
+; CHECK-IC1-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[VEC_PHI4]]
+; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP15]], 1
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT12]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[M]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP17]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT6]], splat (i1 true)
+; CHECK-IC1-NEXT: [[TMP19]] = select <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP19]])
+; CHECK-IC1-NEXT: br i1 [[TMP20]], label %[[INNER_LOOP1]], label %[[LOOP_LATCH9]]
+; CHECK-IC1: [[LOOP_LATCH9]]:
+; CHECK-IC1-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP14]], ptr [[TMP21]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-IC1: [[EXIT]]:
+; CHECK-IC1-NEXT: ret void
+;
+; CHECK-IC2-LABEL: define void @foo(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-IC2-NEXT: [[ENTRY:.*]]:
+; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
+; CHECK-IC2-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP6]], i64 [[N]])
+; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2: [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH22:.*]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH22]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT25:%.*]], %[[LOOP_LATCH22]] ]
+; CHECK-IC2-NEXT: br label %[[INNER_LOOP3:.*]]
+; CHECK-IC2: [[INNER_LOOP3]]:
+; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP21:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP22:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP33:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK2]], %[[VECTOR_BODY]] ], [ [[TMP34:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI9:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP23:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI10:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP24:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[TMP40:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]]
+; CHECK-IC2-NEXT: [[TMP41:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
+; CHECK-IC2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP14]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP42]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP12]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT12]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP13]], [[M]]
+; CHECK-IC2-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC2-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4
+; CHECK-IC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT: [[TMP20:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT13]], [[WIDE_MASKED_LOAD11]]
+; CHECK-IC2-NEXT: [[TMP21]] = fadd <vscale x 4 x float> [[VEC_PHI5]], [[TMP19]]
+; CHECK-IC2-NEXT: [[TMP22]] = fadd <vscale x 4 x float> [[VEC_PHI6]], [[TMP20]]
+; CHECK-IC2-NEXT: [[TMP23]] = select <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x float> [[TMP21]], <vscale x 4 x float> [[VEC_PHI9]]
+; CHECK-IC2-NEXT: [[TMP24]] = select <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x float> [[TMP22]], <vscale x 4 x float> [[VEC_PHI10]]
+; CHECK-IC2-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP26:%.*]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP26]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT27]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT26]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP27:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT: [[TMP28:%.*]] = add nuw nsw i64 [[TMP27]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP28]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT29]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP26]], [[M]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP29]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[M]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP30]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT16]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP31:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT15]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP32:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP33]] = select <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP34]] = select <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x i1> [[TMP32]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP33]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP35]], i64 0
+; CHECK-IC2-NEXT: [[TMP36:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP34]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP36]], i64 0
+; CHECK-IC2-NEXT: [[TMP37:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT18]], [[BROADCAST_SPLATINSERT20]]
+; CHECK-IC2-NEXT: [[TMP38:%.*]] = shufflevector <vscale x 4 x i1> [[TMP37]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP39:%.*]] = extractelement <vscale x 4 x i1> [[TMP38]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP39]], label %[[INNER_LOOP3]], label %[[LOOP_LATCH22]]
+; CHECK-IC2: [[LOOP_LATCH22]]:
+; CHECK-IC2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX30:%.*]] = shl i64 [[TMP48]], 4
+; CHECK-IC2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i64 [[DOTIDX30]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP23]], ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP24]], ptr [[TMP49]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC2-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP44:%.*]] = shl i64 [[TMP43]], 2
+; CHECK-IC2-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], [[TMP44]]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT25]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP45]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[TMP46:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP46]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-IC2: [[EXIT]]:
+; CHECK-IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ br label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+ %mul = fmul float %b.load, %c.load
+ %a.next = fadd float %a.phi, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %M
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+; #pragma clang loop vectorize(enable)
+; for (size_t j = 0; j < N; j++) {
+; float a = 0.;
+; for (size_t k = 0; k < j; k++)
+; a += B[i][k] * C[k][j];
+; A[i][j] = a;
+; }
+; }
+;;; Note that the inner loop's trip-count depends on the outer loop's induction variable.
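+;;; A rough sketch (hypothetical, not taken from the generated output) of how
+;;; a lane-varying inner trip-count is expected to be handled: every lane of
+;;; the vectorized i-loop keeps its own "still iterating" bit, and the inner
+;;; loop runs until no lane is active anymore:
+;   mask = active_lane_mask(i, N);      // one bit per lane of the i-loop
+;   j = 0; a = 0.0f;                    // uniform induction, per-lane accum
+;   do {
+;     a[mask] += B[j] * C[j*M + i];     // masked load/FMA per active lane
+;     j += 1;
+;     mask &= (j != i);                 // each lane's trip-count is its own i
+;   } while (any(mask));                // i.e. vector.reduce.or of the mask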
+define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-IC1-LABEL: define void @bar(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH3:.*]] ]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH3]] ]
+; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1: [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP7]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]]
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT: [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI3]], [[TMP12]]
+; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT: [[TMP21:%.*]] = add i64 [[TMP15]], 1
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP21]], [[INDEX]]
+; CHECK-IC1-NEXT: br i1 [[TMP14]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]]
+; CHECK-IC1: [[LOOP_LATCH3]]:
+; CHECK-IC1-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP13]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT: br i1 [[TMP20]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-IC1: [[EXIT]]:
+; CHECK-IC1-NEXT: ret void
+;
+; CHECK-IC2-LABEL: define void @bar(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT: [[ENTRY:.*]]:
+; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 3
+; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]])
+; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2: [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH11:.*]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH11]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], %[[LOOP_LATCH11]] ]
+; CHECK-IC2-NEXT: br label %[[INNER_LOOP3:.*]]
+; CHECK-IC2: [[INNER_LOOP3]]:
+; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT16:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT18:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP29:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP30:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP12]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP9:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP10]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP11]], [[M]]
+; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP14]]
+; CHECK-IC2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4
+; CHECK-IC2-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT: [[TMP20:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER13]], [[WIDE_MASKED_LOAD14]]
+; CHECK-IC2-NEXT: [[TMP29]] = fadd <vscale x 4 x float> [[VEC_PHI5]], [[TMP19]]
+; CHECK-IC2-NEXT: [[TMP30]] = fadd <vscale x 4 x float> [[VEC_PHI6]], [[TMP20]]
+; CHECK-IC2-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP22:%.*]] = add i64 [[TMP21]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP22]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT16]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT15]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP24]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT18]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT17]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP22]], [[INDEX]]
+; CHECK-IC2-NEXT: br i1 [[TMP25]], label %[[LOOP_LATCH11]], label %[[INNER_LOOP3]]
+; CHECK-IC2: [[LOOP_LATCH11]]:
+; CHECK-IC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX19:%.*]] = shl i64 [[TMP27]], 4
+; CHECK-IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[DOTIDX19]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP29]], ptr [[TMP26]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP30]], ptr [[TMP28]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-IC2-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP40:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-IC2-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT22]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[TMP42:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-IC2: [[EXIT]]:
+; CHECK-IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ br label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+ %mul = fmul float %b.load, %c.load
+ %a.next = fadd float %a.phi, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %i
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+; long a = A[i];
+; long j = 0;
+; if (a > 0) {
+; do {
+; a -= B[j];
+; j++;
+; } while (a > 0);
+; }
+; A[i] = a + j;
+; }
+;;; Note that the inner loop is guarded by a branch, so the start value of the
+;;; inner-loop mask phi must be set correspondingly. The inner loop's induction
+;;; is used both for uniform memory accesses and as a live-out, so the
+;;; vectorized code should contain two phis for it (one scalar and one widened).
+;;; Also, in this example, the inner-loop backedge is the first successor of
+;;; the latch terminator, not the second one as VPlan assumes.
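+;;; A rough sketch (hypothetical, for illustration only) of the expected mask
+;;; handling: lanes whose loaded `a` is not positive never enter the inner
+;;; loop, so the guard is folded into the inner loop's start mask, and each
+;;; iteration clears the lanes whose `a` became non-positive:
+;   mask = active_lane_mask(i, N) & (a > 0);  // guard folded into start mask
+;   do {
+;     a[mask] -= B[j];                        // masked gather + subtract
+;     j += 1;                                 // uniform scalar induction
+;     mask &= (a > 0);                        // lanes exit independently
+;   } while (any(mask));
+;   A[i] = select(guard, a, a_orig);          // blend for never-entered lanes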
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) {
+; CHECK-IC1-LABEL: define void @baz(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3:.*]] ]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3]] ]
+; CHECK-IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC1-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1: [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP10]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP9]] = sub <vscale x 2 x i64> [[VEC_PHI3]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC1-NEXT: [[J2:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[J2]], 1
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT6]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i64> [[TMP9]], i64 0
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP13]], 1
+; CHECK-IC1-NEXT: br i1 [[TMP12]], label %[[LOOP_LATCH_LOOPEXIT3]], label %[[INNER_LOOP1]]
+; CHECK-IC1: [[LOOP_LATCH_LOOPEXIT3]]:
+; CHECK-IC1-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT: br i1 [[TMP15]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-IC1: [[EXIT]]:
+; CHECK-IC1-NEXT: ret void
+;
+; CHECK-IC2-LABEL: define void @baz(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT: [[ENTRY:.*]]:
+; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC2-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]])
+; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2: [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11:.*]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ]
+; CHECK-IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP8]], 4
+; CHECK-IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP10:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-IC2-NEXT: [[TMP11:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], zeroinitializer
+; CHECK-IC2-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC2-NEXT: br label %[[INNER_LOOP4:.*]]
+; CHECK-IC2: [[INNER_LOOP4]]:
+; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT17:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP18:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP14]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP15]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP16:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI5]], i64 0
+; CHECK-IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP16]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP17]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT8]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT9]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP18]] = sub <vscale x 2 x i64> [[VEC_PHI6]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC2-NEXT: [[TMP19]] = sub <vscale x 2 x i64> [[VEC_PHI7]], [[WIDE_MASKED_GATHER10]]
+; CHECK-IC2-NEXT: [[J6:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[J6]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT17]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT16]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI5]], i64 0
+; CHECK-IC2-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[TMP22]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP23]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT19]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT18]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i64> [[TMP18]], i64 0
+; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp slt i64 [[TMP24]], 1
+; CHECK-IC2-NEXT: br i1 [[TMP25]], label %[[LOOP_LATCH_LOOPEXIT11]], label %[[INNER_LOOP4]]
+; CHECK-IC2: [[LOOP_LATCH_LOOPEXIT11]]:
+; CHECK-IC2-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> [[TMP18]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT: [[PREDPHI14:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]]
+; CHECK-IC2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX20:%.*]] = shl i64 [[TMP26]], 4
+; CHECK-IC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX20]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI14]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP29:%.*]] = shl i64 [[TMP28]], 1
+; CHECK-IC2-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP30]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[TMP31:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP31]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-IC2: [[EXIT]]:
+; CHECK-IC2-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ %a.addr = getelementptr inbounds i64, ptr %A, i64 %i
+ %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3
+ %a.is.positive = icmp sgt i64 %a.load, 0
+ br i1 %a.is.positive, label %inner.loop, label %loop.latch
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ]
+ %b.addr = getelementptr inbounds i64, ptr %B, i64 %j
+ %b.load = load i64, ptr %b.addr, align 8, !llvm.access.group !3
+ %a.next = sub i64 %a.phi, %b.load
+ %j.next = add nuw nsw i64 %j, 1
+ %a.is.still.positive = icmp sgt i64 %a.next, 0
+ br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+ %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+ store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Triple-loop nest with the outermost loop being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+; for (size_t j = 0; j < M; j++)
+; for (size_t k = 0; k < L; k++)
+; A[k][i] += B[i][k];
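+;;; A rough sketch (hypothetical, not taken from the generated output) of how
+;;; the nested masks are expected to compose: each loop level ANDs its own
+;;; entry guard and latch condition into the mask of the enclosing level:
+;   outer_mask  = active_lane_mask(i, N);
+;   middle_mask = outer_mask  & (M != 0);   // middle-loop entry guard
+;   inner_mask  = middle_mask & (L != 0);   // inner-loop entry guard
+;   // ... masked loads/stores under inner_mask ...
+;   inner_mask  &= (k + 1 != L);            // inner latch, uniform here
+;   middle_mask &= (j + 1 != M);            // middle latch, uniform here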
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-IC1-LABEL: define void @quuz(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT: [[ENTRY:.*:]]
+; CHECK-IC1-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-IC1-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC1: [[VECTOR_PH]]:
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[M]], i64 0
+; CHECK-IC1-NEXT: [[TMP5:%.*]] = icmp eq <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = shufflevector <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = xor <vscale x 4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[L]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17:.*]] ]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT: [[TMP9:%.*]] = mul <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP9]]
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT: br label %[[MIDDLE_LOOP3:.*]]
+; CHECK-IC1: [[MIDDLE_LOOP3]]:
+; CHECK-IC1-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[MIDDLE_LATCH_LOOPEXIT12:.*]] ]
+; CHECK-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i1> [ [[TMP12]], %[[VECTOR_BODY]] ], [ [[TMP27:%.*]], %[[MIDDLE_LATCH_LOOPEXIT12]] ]
+; CHECK-IC1-NEXT: [[TMP13:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-IC1-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[VEC_PHI1]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT: br label %[[INNER_LOOP5:.*]]
+; CHECK-IC1: [[INNER_LOOP5]]:
+; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT10:%.*]], %[[INNER_LOOP5]] ]
+; CHECK-IC1-NEXT: [[TMP14:%.*]] = phi <vscale x 4 x i1> [ [[TMP24]], %[[MIDDLE_LOOP3]] ], [ [[TMP25:%.*]], %[[INNER_LOOP5]] ]
+; CHECK-IC1-NEXT: [[K6:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP15:%.*]] = mul i64 [[K6]], [[N]]
+; CHECK-IC1-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP15]]
+; CHECK-IC1-NEXT: [[TMP17:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP10]], <vscale x 4 x i64> [[VEC_PHI]]
+; CHECK-IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC1-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT: [[TMP31:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT: [[TMP19:%.*]] = add i64 [[TMP31]], 1
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP19]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT10]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[L]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP20]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP29:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT11]], splat (i1 true)
+; CHECK-IC1-NEXT: [[TMP25]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP29]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP25]])
+; CHECK-IC1-NEXT: br i1 [[TMP30]], label %[[INNER_LOOP5]], label %[[MIDDLE_LATCH_LOOPEXIT12]]
+; CHECK-IC1: [[MIDDLE_LATCH_LOOPEXIT12]]:
+; CHECK-IC1-NEXT: [[J4:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI2]], i64 0
+; CHECK-IC1-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[J4]], 1
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT19]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT18]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], [[M]]
+; CHECK-IC1-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP22]], i64 0
+; CHECK-IC1-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP26:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT15]], splat (i1 true)
+; CHECK-IC1-NEXT: [[TMP27]] = select <vscale x 4 x i1> [[VEC_PHI1]], <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP27]])
+; CHECK-IC1-NEXT: br i1 [[TMP28]], label %[[MIDDLE_LOOP3]], label %[[OUTER_LATCH_LOOPEXIT17]]
+; CHECK-IC1: [[OUTER_LATCH_LOOPEXIT17]]:
+; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-IC1-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT: br i1 [[TMP23]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-IC1: [[EXIT]]:
+; CHECK-IC1-NEXT: ret void
+;
+; CHECK-IC2-LABEL: define void @quuz(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT: [[ENTRY:.*:]]
+; CHECK-IC2-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-IC2-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC2: [[VECTOR_PH]]:
+; CHECK-IC2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC2-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 3
+; CHECK-IC2-NEXT: [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-IC2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[M]], i64 0
+; CHECK-IC2-NEXT: [[TMP25:%.*]] = icmp eq <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-IC2-NEXT: [[TMP9:%.*]] = shufflevector <vscale x 4 x i1> [[TMP25]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[L]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2: [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35:.*]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP11]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-IC2-NEXT: [[B_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT: [[TMP28:%.*]] = mul <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-IC2-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT6]]
+; CHECK-IC2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP28]]
+; CHECK-IC2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
+; CHECK-IC2-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: br label %[[MIDDLE_LOOP7:.*]]
+; CHECK-IC2: [[MIDDLE_LOOP7]]:
+; CHECK-IC2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT38:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26:.*]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT40:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI9:%.*]] = phi <vscale x 4 x i1> [ [[TMP17]], %[[VECTOR_BODY]] ], [ [[TMP57:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI10:%.*]] = phi <vscale x 4 x i1> [ [[TMP18]], %[[VECTOR_BODY]] ], [ [[TMP58:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT: [[TMP19:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-IC2-NEXT: [[TMP20:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-IC2-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[VEC_PHI9]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP22:%.*]] = select <vscale x 4 x i1> [[VEC_PHI10]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: br label %[[INNER_LOOP11:.*]]
+; CHECK-IC2: [[INNER_LOOP11]]:
+; CHECK-IC2-NEXT: [[VEC_PHI12:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT42:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI13:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT44:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI14:%.*]] = phi <vscale x 4 x i1> [ [[TMP21]], %[[MIDDLE_LOOP7]] ], [ [[TMP64:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT: [[VEC_PHI15:%.*]] = phi <vscale x 4 x i1> [ [[TMP22]], %[[MIDDLE_LOOP7]] ], [ [[TMP43:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT: [[TMP23:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI12]], i64 0
+; CHECK-IC2-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], [[N]]
+; CHECK-IC2-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[TMP24]]
+; CHECK-IC2-NEXT: [[TMP26:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP15]], <vscale x 4 x i64> [[VEC_PHI12]]
+; CHECK-IC2-NEXT: [[TMP27:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP16]], <vscale x 4 x i64> [[VEC_PHI13]]
+; CHECK-IC2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX7:%.*]] = shl i64 [[TMP13]], 4
+; CHECK-IC2-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX7]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP30:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC2-NEXT: [[TMP31:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_GATHER13]]
+; CHECK-IC2-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[DOTIDX21:%.*]] = shl i64 [[TMP32]], 4
+; CHECK-IC2-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX21]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP30]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP31]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI12]], i64 0
+; CHECK-IC2-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT41:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP35]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT42]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT41]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP36:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI13]], i64 0
+; CHECK-IC2-NEXT: [[TMP37:%.*]] = add i64 [[TMP36]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT43:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT44]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT43]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP35]], [[L]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP38]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT18]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP37]], [[L]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP65]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT20]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP66:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT19]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP67:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT21]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP64]] = select <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x i1> [[TMP66]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP43]] = select <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x i1> [[TMP67]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP44:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP64]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP44]], i64 0
+; CHECK-IC2-NEXT: [[TMP45:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP43]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT24:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP45]], i64 0
+; CHECK-IC2-NEXT: [[TMP46:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT22]], [[BROADCAST_SPLATINSERT24]]
+; CHECK-IC2-NEXT: [[TMP47:%.*]] = shufflevector <vscale x 4 x i1> [[TMP46]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP48:%.*]] = extractelement <vscale x 4 x i1> [[TMP47]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP48]], label %[[INNER_LOOP11]], label %[[MIDDLE_LATCH_LOOPEXIT26]]
+; CHECK-IC2: [[MIDDLE_LATCH_LOOPEXIT26]]:
+; CHECK-IC2-NEXT: [[TMP49:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT: [[TMP50:%.*]] = add nuw nsw i64 [[TMP49]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP50]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT38]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT37]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP51:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI8]], i64 0
+; CHECK-IC2-NEXT: [[TMP52:%.*]] = add nuw nsw i64 [[TMP51]], 1
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP52]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT40]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT39]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP53:%.*]] = icmp eq i64 [[TMP50]], [[M]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP53]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT27]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP52]], [[M]]
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP54]], i64 0
+; CHECK-IC2-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT29]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP55:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT28]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP56:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT30]], splat (i1 true)
+; CHECK-IC2-NEXT: [[TMP57]] = select <vscale x 4 x i1> [[VEC_PHI9]], <vscale x 4 x i1> [[TMP55]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP58]] = select <vscale x 4 x i1> [[VEC_PHI10]], <vscale x 4 x i1> [[TMP56]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP59:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP57]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP59]], i64 0
+; CHECK-IC2-NEXT: [[TMP60:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP58]])
+; CHECK-IC2-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP60]], i64 0
+; CHECK-IC2-NEXT: [[TMP61:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT31]], [[BROADCAST_SPLATINSERT33]]
+; CHECK-IC2-NEXT: [[TMP62:%.*]] = shufflevector <vscale x 4 x i1> [[TMP61]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT: [[TMP63:%.*]] = extractelement <vscale x 4 x i1> [[TMP62]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP63]], label %[[MIDDLE_LOOP7]], label %[[OUTER_LATCH_LOOPEXIT35]]
+; CHECK-IC2: [[OUTER_LATCH_LOOPEXIT35]]:
+; CHECK-IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-IC2-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT: [[TMP40:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-IC2-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]])
+; CHECK-IC2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT4]]
+; CHECK-IC2-NEXT: [[TMP42:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT: br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-IC2: [[EXIT]]:
+; CHECK-IC2-NEXT: ret void
+;
+entry:
+ %N.is.zero = icmp eq i64 %N, 0
+ br i1 %N.is.zero, label %exit, label %outer.loop
+
+outer.loop:
+ %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ]
+ %a.inv.gep = getelementptr float, ptr %A, i64 %i
+ %i.x.L = mul i64 %i, %L
+ %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L
+ %M.is.zero = icmp eq i64 %M, 0
+ br i1 %M.is.zero, label %outer.latch, label %middle.loop
+
+middle.loop:
+ %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ]
+ %L.is.zero = icmp eq i64 %L, 0
+ br i1 %L.is.zero, label %middle.latch, label %inner.loop
+
+inner.loop:
+ %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ]
+ %k.x.N = mul i64 %k, %N
+ %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N
+ %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k
+ %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3
+ %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3
+ %res = fadd float %a.load, %b.load
+ store float %res, ptr %a.gep, align 4, !llvm.access.group !3
+ %k.next = add nuw nsw i64 %k, 1
+ %inner.exitcond = icmp eq i64 %k.next, %L
+ br i1 %inner.exitcond, label %middle.latch, label %inner.loop
+
+middle.latch:
+ %j.next = add nuw nsw i64 %j, 1
+ %middle.exitcond = icmp eq i64 %j.next, %M
+ br i1 %middle.exitcond, label %outer.latch, label %middle.loop
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %outer.exitcond = icmp eq i64 %i.next, %N
+ br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
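+;;; Note: !0 is the loop ID attached to the outer latch above; !1 requests
+;;; vectorization, and !2 declares every memory access in access group !3
+;;; (see the !llvm.access.group annotations on the loads/stores) as free of
+;;; loop-carried dependences for that loop.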
+;.
+; CHECK-IC1: [[ACC_GRP0]] = distinct !{}
+; CHECK-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-IC1: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]}
+;.
+; CHECK-IC2: [[ACC_GRP0]] = distinct !{}
+; CHECK-IC2: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-IC2: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK-IC2: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-IC2: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC2: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
new file mode 100644
index 0000000000000..46b7bf6f4c7b3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
@@ -0,0 +1,647 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=4 -force-vector-interleave=1 -experimental-olv-in-classic-vect < %s | FileCheck %s
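+; (instcombine and simplifycfg run after the vectorizer, so the CHECK lines
+; below reflect cleaned-up rather than raw vectorizer output.)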
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+; #pragma clang loop vectorize(enable)
+; for (size_t j = 0; j < N; j++) {
+; float a = 0.;
+; for (size_t k = 0; k < M; k++)
+; a += B[i][k] * C[k][j];
+; A[i][j] = a;
+; }
+; }
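+;;; A rough reading of the CHECK lines below (post-instcombine, so a sketch,
+;;; not a spec): the loop carrying the pragma becomes the vectorized loop, the
+;;; k-loop's float accumulator is widened to a <4 x float> phi, the inner
+;;; induction stays uniform across lanes (its first lane is extracted and
+;;; re-broadcast), and the inner loop exits only once every lane is done (the
+;;; lane mask is bitcast to i4 and tested against zero).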
+define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ]
+; CHECK-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK: [[INNER_LOOP1]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ splat (i1 true), %[[VECTOR_BODY]] ], [ [[TMP15:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP7]], [[M]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6]] = fadd <4 x float> [[VEC_PHI]], [[TMP5]]
+; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[VEC_PHI3]], <4 x float> [[TMP6]], <4 x float> [[VEC_PHI4]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP19]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT12]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP21]], [[M]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15]] = select <4 x i1> [[VEC_PHI3]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i1> [[TMP15]] to i4
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP16]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label %[[LOOP_LATCH9]], label %[[INNER_LOOP1]]
+; CHECK: [[LOOP_LATCH9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: br label %[[INNER_LOOP:.*]]
+; CHECK: [[INNER_LOOP]]:
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[JXM:%.*]] = mul i64 [[J]], [[M]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]]
+; CHECK-NEXT: [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[I]]
+; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]]
+; CHECK-NEXT: [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]]
+; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]]
+; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT: store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ br label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+ %mul = fmul float %b.load, %c.load
+ %a.next = fadd float %a.phi, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %M
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+; #pragma clang loop vectorize(enable)
+; for (size_t j = 0; j < N; j++) {
+; float a = 0.;
+; for (size_t k = 0; k < j; k++)
+; a += B[i][k] * C[k][j];
+; A[i][j] = a;
+; }
+; }
+;;; Note that the inner loop's trip-count depends on the outer loop.
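+;;; A note on the CHECK lines below: because C is indexed with a lane-varying
+;;; offset, the load of C[k][j] is expanded into four scalar loads whose
+;;; results are reassembled with insertelement, i.e. an open-coded gather.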
+define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC1:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[LOOP_LATCH3:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[LOOP_LATCH3]] ]
+; CHECK-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK: [[INNER_LOOP1]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP44:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP5]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i64 2
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i64 3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP12]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP15]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP16]], i64 1
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 2
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 3
+; CHECK-NEXT: [[TMP43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP41]]
+; CHECK-NEXT: [[TMP44]] = fadd <4 x float> [[VEC_PHI3]], [[TMP43]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX_NEXT]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP51]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]]
+; CHECK: [[LOOP_LATCH3]]:
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[N_VEC]]
+; CHECK-NEXT: store <4 x float> [[TMP44]], ptr [[TMP28]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[N_VEC]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: br label %[[INNER_LOOP:.*]]
+; CHECK: [[INNER_LOOP]]:
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[JXM:%.*]] = mul i64 [[J]], [[M]]
+; CHECK-NEXT: [[TMP52:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]]
+; CHECK-NEXT: [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP52]], i64 [[I]]
+; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]]
+; CHECK-NEXT: [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]]
+; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[I]]
+; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT: store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ br label %inner.loop
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+ %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+ %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+ %jxM = mul i64 %j, %M
+ %jxMpi = add i64 %jxM, %i
+ %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+ %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+ %mul = fmul float %b.load, %c.load
+ %a.next = fadd float %a.phi, %mul
+ %j.next = add nuw nsw i64 %j, 1
+ %inner.exitcond = icmp eq i64 %j.next, %i
+ br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+ %a.lcssa = phi float [ %a.next, %inner.loop ]
+ %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+ store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+; long a = A[i];
+; long j = 0;
+; if (a > 0) {
+; do {
+; a -= B[j];
+; j++;
+; } while (a > 0);
+; }
+; A[i] = a + j;
+; }
+;;; Note that the inner loop is behind a branch, so the start value of the
+;;; inner-loop mask phi must be set correspondingly. The induction of the
+;;; inner loop is used for uniform memory accesses and as a live-out, so the
+;;; vectorized code should contain two phis for it (one scalar and one
+;;; widened).
+;;; Also, in this example, the inner-loop backedge is the first successor of
+;;; the latch terminator, not the second one as VPlan assumes.
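+;;; In the CHECK lines below (after instcombine/simplifycfg) the predication
+;;; shows up as per-lane PRED_LOAD_IF/PRED_LOAD_CONTINUE blocks guarding the
+;;; loads of B[j], and the value live out of the conditional inner loop is
+;;; blended back with a select (the PREDPHI value).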
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @baz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT9:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: br label %[[INNER_LOOP1:.*]]
+; CHECK: [[INNER_LOOP1]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP20:%.*]], %[[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[A_LOAD:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[A_LOAD]], i64 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ poison, %[[INNER_LOOP1]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[TMP9]], i64 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK: [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
+; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]]
+; CHECK: [[PRED_LOAD_IF5]]:
+; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP13]], i64 2
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK: [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP11]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP14]], %[[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]]
+; CHECK: [[PRED_LOAD_IF7]]:
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP17]], i64 3
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]]
+; CHECK: [[PRED_LOAD_CONTINUE8]]:
+; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP18]], %[[PRED_LOAD_IF7]] ]
+; CHECK-NEXT: [[TMP20]] = sub <4 x i64> [[VEC_PHI2]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP21]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP22]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i64 [[TMP23]], 1
+; CHECK-NEXT: br i1 [[TMP24]], label %[[LOOP_LATCH_LOOPEXIT9]], label %[[INNER_LOOP1]]
+; CHECK: [[LOOP_LATCH_LOOPEXIT9]]:
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[TMP20]], <4 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[A_ADDR1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; CHECK-NEXT: [[A_LOAD1:%.*]] = load i64, ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[A_IS_POSITIVE:%.*]] = icmp sgt i64 [[A_LOAD1]], 0
+; CHECK-NEXT: br i1 [[A_IS_POSITIVE]], label %[[INNER_LOOP:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[INNER_LOOP]]:
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i64, ptr [[B]], i64 [[J]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load i64, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[A_NEXT]] = sub i64 [[A_PHI]], [[B_LOAD]]
+; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT: [[A_IS_STILL_POSITIVE:%.*]] = icmp sgt i64 [[A_NEXT]], 0
+; CHECK-NEXT: br i1 [[A_IS_STILL_POSITIVE]], label %[[INNER_LOOP]], label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[A_RES:%.*]] = phi i64 [ [[A_LOAD1]], %[[LOOP_HEADER]] ], [ [[A_NEXT]], %[[INNER_LOOP]] ]
+; CHECK-NEXT: store i64 [[A_RES]], ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+ %a.addr = getelementptr inbounds i64, ptr %A, i64 %i
+ %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3
+ %a.is.positive = icmp sgt i64 %a.load, 0
+ br i1 %a.is.positive, label %inner.loop, label %loop.latch
+
+inner.loop:
+ %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+ %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ]
+ %b.addr = getelementptr inbounds i64, ptr %B, i64 %j
+ %b.load = load i64, ptr %b.addr, align 8, !llvm.access.group !3
+ %a.next = sub i64 %a.phi, %b.load
+ %j.next = add nuw nsw i64 %j, 1
+ %a.is.still.positive = icmp sgt i64 %a.next, 0
+ br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+ %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+ store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+ %i.next = add nuw nsw i64 %i, 1
+ %loop.exitcond = icmp eq i64 %i.next, %N
+ br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+;;; Triple-loop nest with the outer-most one being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+; for (size_t j = 0; j < M; j++)
+; for (size_t k = 0; k < L; k++)
+; A[k][i] += B[i][k];
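+;;; In the CHECK lines below, every level of the nest keeps its own lane-mask
+;;; phi; the middle and inner masks are derived from their parents with
+;;; selects, and the A[k][i] update is emitted as per-lane predicated
+;;; PRED_STORE_IF/PRED_STORE_CONTINUE blocks.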
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @quuz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT: br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[OUTER_LOOP_PREHEADER:.*]]
+; CHECK: [[OUTER_LOOP_PREHEADER]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[L]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: br label %[[MIDDLE_LOOP3:.*]]
+; CHECK: [[MIDDLE_LOOP3]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20:.*]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ [[TMP7]], %[[VECTOR_BODY]] ], [ [[TMP65:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: br label %[[INNER_LOOP5:.*]]
+; CHECK: [[INNER_LOOP5]]:
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[PRED_STORE_CONTINUE15:.*]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i1> [ [[TMP4]], %[[MIDDLE_LOOP3]] ], [ [[TMP58:%.*]], %[[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI6]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 0
+; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP29]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP32]]
+; CHECK-NEXT: [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP33]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP16:%.*]] = fadd float [[TMP10]], [[TMP15]]
+; CHECK-NEXT: store float [[TMP16]], ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 1
+; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; CHECK: [[PRED_STORE_IF10]]:
+; CHECK-NEXT: [[TMP18:%.*]] = or disjoint i64 [[TMP28]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP5]], i64 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP19]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 1
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP24]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP67:%.*]] = fadd float [[TMP22]], [[TMP27]]
+; CHECK-NEXT: store float [[TMP67]], ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; CHECK: [[PRED_STORE_CONTINUE11]]:
+; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 2
+; CHECK-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; CHECK: [[PRED_STORE_IF12]]:
+; CHECK-NEXT: [[TMP30:%.*]] = or disjoint i64 [[TMP28]], 2
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i64> [[TMP5]], i64 2
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr float, ptr [[TMP31]], i64 [[TMP69]]
+; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 2
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP36]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP40:%.*]] = fadd float [[TMP34]], [[TMP39]]
+; CHECK-NEXT: store float [[TMP40]], ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]]
+; CHECK: [[PRED_STORE_CONTINUE13]]:
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 3
+; CHECK-NEXT: br i1 [[TMP41]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15]]
+; CHECK: [[PRED_STORE_IF14]]:
+; CHECK-NEXT: [[TMP42:%.*]] = or disjoint i64 [[TMP28]], 3
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP42]]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP5]], i64 3
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP43]], i64 [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 3
+; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP48]], i64 [[TMP49]]
+; CHECK-NEXT: [[TMP51:%.*]] = load float, ptr [[TMP50]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[TMP52:%.*]] = fadd float [[TMP46]], [[TMP51]]
+; CHECK-NEXT: store float [[TMP52]], ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; CHECK: [[PRED_STORE_CONTINUE15]]:
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0
+; CHECK-NEXT: [[TMP54:%.*]] = add nuw nsw i64 [[TMP71]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> poison, i64 [[TMP54]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT29]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[TMP54]], [[L]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[TMP55]], i64 0
+; CHECK-NEXT: [[TMP56:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT16]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i1> [[TMP56]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP58]] = select <4 x i1> [[VEC_PHI7]], <4 x i1> [[TMP57]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP59:%.*]] = bitcast <4 x i1> [[TMP58]] to i4
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP59]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label %[[MIDDLE_LATCH_LOOPEXIT20]], label %[[INNER_LOOP5]]
+; CHECK: [[MIDDLE_LATCH_LOOPEXIT20]]:
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT: [[TMP61:%.*]] = add nuw nsw i64 [[TMP60]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <4 x i64> poison, i64 [[TMP61]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT27]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT26]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP62:%.*]] = icmp eq i64 [[TMP61]], [[M]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i1> poison, i1 [[TMP62]], i64 0
+; CHECK-NEXT: [[TMP63:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT21]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <4 x i1> [[TMP63]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP65]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP64]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP66:%.*]] = bitcast <4 x i1> [[TMP65]] to i4
+; CHECK-NEXT: [[DOTNOT30:%.*]] = icmp eq i4 [[TMP66]], 0
+; CHECK-NEXT: br i1 [[DOTNOT30]], label %[[OUTER_LATCH_LOOPEXIT25]], label %[[MIDDLE_LOOP3]]
+; CHECK: [[OUTER_LATCH_LOOPEXIT25]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP_PREHEADER]] ]
+; CHECK-NEXT: br label %[[OUTER_LOOP:.*]]
+; CHECK: [[OUTER_LOOP]]:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[OUTER_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[A_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT: [[I_X_L:%.*]] = mul i64 [[I]], [[L]]
+; CHECK-NEXT: [[B_INV_GEP:%.*]] = getelementptr float, ptr [[B]], i64 [[I_X_L]]
+; CHECK-NEXT: [[M_IS_ZERO:%.*]] = icmp eq i64 [[M]], 0
+; CHECK-NEXT: br i1 [[M_IS_ZERO]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP:.*]]
+; CHECK: [[MIDDLE_LOOP]]:
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[MIDDLE_LATCH:.*]] ], [ 0, %[[OUTER_LOOP]] ]
+; CHECK-NEXT: [[L_IS_ZERO:%.*]] = icmp eq i64 [[L]], 0
+; CHECK-NEXT: br i1 [[L_IS_ZERO]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP:.*]]
+; CHECK: [[INNER_LOOP]]:
+; CHECK-NEXT: [[K:%.*]] = phi i64 [ [[K_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[MIDDLE_LOOP]] ]
+; CHECK-NEXT: [[K_X_N:%.*]] = mul i64 [[K]], [[N]]
+; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr float, ptr [[A_INV_GEP]], i64 [[K_X_N]]
+; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[K]]
+; CHECK-NEXT: [[A_LOAD:%.*]] = load float, ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load float, ptr [[B_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_LOAD]], [[B_LOAD]]
+; CHECK-NEXT: store float [[RES]], ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[K_NEXT]] = add nuw nsw i64 [[K]], 1
+; CHECK-NEXT: [[INNER_EXITCOND:%.*]] = icmp eq i64 [[K_NEXT]], [[L]]
+; CHECK-NEXT: br i1 [[INNER_EXITCOND]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP]]
+; CHECK: [[MIDDLE_LATCH]]:
+; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT: [[MIDDLE_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]]
+; CHECK-NEXT: br i1 [[MIDDLE_EXITCOND]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP]]
+; CHECK: [[OUTER_LATCH]]:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[OUTER_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[OUTER_EXITCOND]], label %[[EXIT]], label %[[OUTER_LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %N.is.zero = icmp eq i64 %N, 0
+ br i1 %N.is.zero, label %exit, label %outer.loop
+
+outer.loop:
+ %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ]
+ %a.inv.gep = getelementptr float, ptr %A, i64 %i
+ %i.x.L = mul i64 %i, %L
+ %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L
+ %M.is.zero = icmp eq i64 %M, 0
+ br i1 %M.is.zero, label %outer.latch, label %middle.loop
+
+middle.loop:
+ %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ]
+ %L.is.zero = icmp eq i64 %L, 0
+ br i1 %L.is.zero, label %middle.latch, label %inner.loop
+
+inner.loop:
+ %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ]
+ %k.x.N = mul i64 %k, %N
+ %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N
+ %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k
+ %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3
+ %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3
+ %res = fadd float %a.load, %b.load
+ store float %res, ptr %a.gep, align 4, !llvm.access.group !3
+ %k.next = add nuw nsw i64 %k, 1
+ %inner.exitcond = icmp eq i64 %k.next, %L
+ br i1 %inner.exitcond, label %middle.latch, label %inner.loop
+
+middle.latch:
+ %j.next = add nuw nsw i64 %j, 1
+ %middle.exitcond = icmp eq i64 %j.next, %M
+ br i1 %middle.exitcond, label %outer.latch, label %middle.loop
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %outer.exitcond = icmp eq i64 %i.next, %N
+ br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
+;.
+; CHECK: [[ACC_GRP0]] = distinct !{}
+; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META4]], [[META3]]}
+;.