[llvm] Draft: [LV] Outer-loop vectorization in the default vectorizer codepath (PR #128202)

via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 21 09:01:03 PST 2025


https://github.com/iamlouk created https://github.com/llvm/llvm-project/pull/128202

This is a draft PR to get feedback on whether an approach like this would be
considered good enough by the current maintainers to merge into LLVM. I would
split it into smaller pieces if the general direction does not conflict with
current plans. It implements outer-loop vectorization *outside* the
VPlan-native path. Minimal LoopAccessAnalysis support for non-innermost loops
was added, relying on the `!llvm.loop.parallel_accesses` metadata.
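
To illustrate, this is the IR shape the new LAA support keys on (the same
shape as the LAA test added below; `!0` and `!1` are just example metadata
IDs). A loop is treated as parallel when its latch branch references an
access group that every memory access in the loop carries:

```llvm
loop.latch:
  store float %v, ptr %a.addr, align 4, !llvm.access.group !1
  ; ...
  br i1 %done, label %exit, label %loop.header, !llvm.loop !0

!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
!1 = distinct !{}
```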

Unlike the VPlan-native path, this approach supports inner loops with
non-invariant trip counts or non-uniform inductions, and the quality of the
emitted code is better than what the current VPlan-native path produces.
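
For example, a triangular loop nest like the following (a hand-written
sketch; the inner trip count depends on the outer induction variable `%i`)
is rejected by the VPlan-native path but can be handled with this patch:

```llvm
outer.header:
  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
  br label %inner.loop

inner.loop:                                    ; runs %i + 1 times
  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.loop ]
  ; ... loop body ...
  %j.next = add nuw nsw i64 %j, 1
  %inner.done = icmp ugt i64 %j.next, %i
  br i1 %inner.done, label %outer.latch, label %inner.loop
```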

An implementation very close to this one (#124432 required some changes, but
also simplified this PR a lot) was successfully tested, in combination with
basic LAA MemoryDepChecker support for outer loops (not part of this PR), on
the llvm-test-suite and SPEC (~3000 loops, with outer-loop vectorization
forced).

As a real-world motivating example, consider [this loop](https://github.com/HydroBench/Hydro/blob/6fa22ca83df6b355abf1eba42a9de6a24346b48e/HydroC/HydroCplusMPI/Tile.cpp#L1095):
outer-loop vectorizing it more than doubles its performance.

Some of the VPWidenPHIRecipe code is duplicated from #128187.

From 72dccc1b5ded92345de8b63048f2995c223b29dc Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 20:56:40 +0100
Subject: [PATCH 1/5] [VPlan] Update entry/exiting blocks in VPRegionBlocks

---
 llvm/lib/Transforms/Vectorize/VPlanUtils.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 6ddb88308955f..fd197fc8add2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -96,6 +96,9 @@ class VPBlockUtils {
       connectBlocks(NewBlock, Succ);
     }
     connectBlocks(BlockPtr, NewBlock);
+    VPRegionBlock *Parent = BlockPtr->getParent();
+    if (Parent && Parent->getExiting() == BlockPtr)
+      Parent->setExiting(NewBlock);
   }
 
   /// Insert disconnected block \p NewBlock before \p Blockptr. First
@@ -112,6 +115,9 @@ class VPBlockUtils {
       connectBlocks(Pred, NewBlock);
     }
     connectBlocks(NewBlock, BlockPtr);
+    VPRegionBlock *Parent = BlockPtr->getParent();
+    if (Parent && Parent->getEntry() == BlockPtr)
+      Parent->setEntry(NewBlock);
   }
 
   /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p

From 3819995b4647718b3b66ff941d40e9a3184f6bef Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:03:45 +0100
Subject: [PATCH 2/5] [VPlan] Cloning and unrolling for VPWidenPHIRecipe

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  6 +++-
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 31 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
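
Commentary on this patch: conceptually, for UF=2 every inner-loop header phi
is duplicated per part, and each copy receives the matching part of its
backedge value. A sketch, in the shape of the AArch64 test added by the last
patch (names are illustrative):

  %vec.phi.part0 = phi <vscale x 4 x float> [ zeroinitializer, %vector.body ], [ %val.part0, %inner.loop ]
  %vec.phi.part1 = phi <vscale x 4 x float> [ zeroinitializer, %vector.body ], [ %val.part1, %inner.loop ]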

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..15e90bc18bc87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1974,7 +1974,11 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
   }
 
   VPWidenPHIRecipe *clone() override {
-    llvm_unreachable("cloning not implemented yet");
+    auto *Phi = new VPWidenPHIRecipe(
+        dyn_cast_if_present<PHINode>(getUnderlyingValue()));
+    for (unsigned I = 0; I < getNumOperands(); I++)
+      Phi->addOperand(getIncomingValue(I));
+    return Phi;
   }
 
   ~VPWidenPHIRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 89e372d6b46cf..0b46e043e873d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -384,6 +384,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
       continue;
     }
 
+    // Handle inner-loop/region header phis. The backedge values will be set
+    // later. Phis not in a loop header can be unrolled like any other
+    // recipes; RPO ensures that all their predecessors are visited first.
+    VPRegionBlock *Region = R.getParent()->getParent();
+    if (auto *P = dyn_cast<VPWidenPHIRecipe>(&R);
+        P && Region->getEntryBasicBlock() == P->getParent()) {
+      auto InsertPt = std::next(R.getIterator());
+      for (unsigned Part = 1; Part != UF; ++Part) {
+        VPWidenPHIRecipe *Copy = P->clone();
+        Copy->insertBefore(*R.getParent(), InsertPt);
+        addRecipeForPart(&R, Copy, Part);
+      }
+      continue;
+    }
+
     unrollRecipeByUF(R);
   }
 }
@@ -442,5 +457,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
     Part++;
   }
 
+  // Remap operands of cloned inner-loop header phis to update backedge values,
+  // a problem unique to outer-loop vectorization.
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+      DeepRPOT(Plan.getEntry());
+  for (VPRegionBlock *Region :
+       VPBlockUtils::blocksOnly<VPRegionBlock>(DeepRPOT))
+    for (VPRecipeBase &R : Region->getEntryBasicBlock()->phis())
+      if (auto *Phi = dyn_cast<VPWidenPHIRecipe>(&R)) {
+        if (Unroller.contains(Phi->getVPSingleValue())) {
+          Part = 1;
+          continue;
+        }
+        Unroller.remapOperands(&R, Part);
+        Part++;
+      }
+
   VPlanTransforms::removeDeadRecipes(Plan);
 }

From 407d320aba89140c06e59f326847c5b6854a3359 Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:04:35 +0100
Subject: [PATCH 3/5] [VPlan] Unrolling of VPInstruction::AnyOf

---
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
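
For context: `AnyOf` lowers to a `vector.reduce.or`, so for UF=2 the unrolled
exit test amounts to something like this sketch (the `nxv4i1` type matches
the SVE test added by the last patch; names are illustrative):

  %any.part0 = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> %mask.part0)
  %any.part1 = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> %mask.part1)
  %any = or i1 %any.part0, %any.part1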

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 0b46e043e873d..2360a20d78cd5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -373,6 +373,28 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
       continue;
     }
 
+    if (auto *Any = dyn_cast<VPInstruction>(&R);
+        Any && Any->getOpcode() == VPInstruction::AnyOf) {
+      VPValue *Res = Any;
+      VPRecipeBase *FirstOr = nullptr;
+      for (unsigned Part = 1; Part != UF; ++Part) {
+        auto *NewAny = new VPInstruction(
+            VPInstruction::AnyOf, {getValueForPart(Any->getOperand(0), Part)},
+            Any->getDebugLoc());
+        NewAny->insertAfter(Res->getDefiningRecipe());
+        auto *Or = new VPInstruction(Instruction::Or, {Res, NewAny},
+                                     Any->getDebugLoc());
+        Or->insertAfter(NewAny->getDefiningRecipe());
+        ToSkip.insert(Or);
+        if (Part == 1)
+          FirstOr = Or;
+        Res = Or;
+      }
+      Any->getVPSingleValue()->replaceAllUsesWith(Res);
+      FirstOr->setOperand(0, Any);
+      continue;
+    }
+
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
       addUniformForAllParts(SingleDef);

From 66556d57feadce2782b3498c52171d8fa564c48a Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:06:00 +0100
Subject: [PATCH 4/5] [LAA] Basic initial outer-loop support

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  69 ++++++++--
 .../LoopAccessAnalysis/outer-loops.ll         | 128 ++++++++++++++++++
 2 files changed, 186 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
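
As a worked example of the new stride logic, consider the access C[j*M + i]
from the tests added below, when the outer loop (over i) is queried. Its
pointer SCEV is roughly

  {{%C,+,4}<%loop.header>,+,(4 * %M)}<%inner.loop>

The step of the inner recurrence, 4 * %M, is invariant to the outer loop, so
the inner recurrence can be peeled off via its start, reaching
{%C,+,4}<%loop.header> and thus a constant stride of one float over the
queried loop.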

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a1d91de3bb788..6fe7a8a9eed69 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -792,21 +792,65 @@ class AccessAnalysis {
 
 } // end anonymous namespace
 
+/// Return true if \p E is invariant with regard to the loop \p L.
+/// If \p E is a recurrence around an inner loop of \p L, then the
+/// start and step of that inner-loop recurrence must be invariant
+/// to \p L.
+static bool isInvariantToTheLoop(const Loop *L, ScalarEvolution &SE,
+                                 const SCEV *E) {
+  if (SE.isLoopInvariant(E, L))
+    return true;
+
+  if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(E);
+      AddRec && L != AddRec->getLoop() && L->contains(AddRec->getLoop())) {
+    for (auto *Op : AddRec->operands())
+      if (!isInvariantToTheLoop(L, SE, Op))
+        return false;
+
+    return true;
+  }
+
+  return false;
+}
+
 /// Try to compute a constant stride for \p AR. Used by getPtrStride and
 /// isNoWrap.
 static std::optional<int64_t>
 getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
                     Value *Ptr, PredicatedScalarEvolution &PSE) {
-  // The access function must stride over the innermost loop.
+  // The access function must stride over the queried loop.
   if (Lp != AR->getLoop()) {
-    LLVM_DEBUG({
-      dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
-      if (Ptr)
-        dbgs() << *Ptr << " ";
+    assert(!Lp->isInnermost() && Lp->contains(AR->getLoop()) &&
+           "Classic SE should have detected invariance");
+    while (AR && Lp != AR->getLoop()) {
+      if (isInvariantToTheLoop(Lp, *PSE.getSE(), AR))
+        return {0};
+
+      const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+      if (!isInvariantToTheLoop(Lp, *PSE.getSE(), Step)) {
+        LLVM_DEBUG({
+          dbgs() << "LAA: Bad stride - Depends on inner loop ";
+          if (Ptr)
+            dbgs() << *Ptr << " ";
+
+          dbgs() << "SCEV: " << *AR << "\n";
+        });
+        return std::nullopt;
+      }
 
-      dbgs() << "SCEV: " << *AR << "\n";
-    });
-    return std::nullopt;
+      AR = dyn_cast<SCEVAddRecExpr>(AR->getStart());
+    }
+
+    if (!AR || Lp != AR->getLoop()) {
+      LLVM_DEBUG({
+        dbgs() << "LAA: Bad stride - Strides over inner loop ";
+        if (Ptr)
+          dbgs() << *Ptr << " ";
+
+        dbgs() << "SCEV: " << *AR << "\n";
+      });
+      return std::nullopt;
+    }
   }
 
   // Check the step is constant.
@@ -2365,8 +2409,9 @@ bool LoopAccessInfo::canAnalyzeLoop() {
                     << TheLoop->getHeader()->getParent()->getName() << "' from "
                     << TheLoop->getLocStr() << "\n");
 
-  // We can only analyze innermost loops.
-  if (!TheLoop->isInnermost()) {
+  // Non-innermost loops can only be analyzed if no memory dependence
+  // checks are needed, i.e. if they are annotated parallel.
+  if (!TheLoop->isInnermost() && !TheLoop->isAnnotatedParallel()) {
     LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
     recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
     return false;
@@ -2587,6 +2632,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
     return true;
   }
 
+  assert(TheLoop->isInnermost());
+
   for (LoadInst *LD : Loads) {
     Value *Ptr = LD->getPointerOperand();
     // If we did *not* see this pointer before, insert it to the
@@ -2812,7 +2859,7 @@ bool LoopAccessInfo::isInvariant(Value *V) const {
   if (!SE->isSCEVable(V->getType()))
     return false;
   const SCEV *S = SE->getSCEV(V);
-  return SE->isLoopInvariant(S, TheLoop);
+  return isInvariantToTheLoop(TheLoop, *SE, S);
 }
 
 /// If \p Ptr is a GEP, which has a loop-variant operand, return that operand.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
new file mode 100644
index 0000000000000..c71d821a7b0b6
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/outer-loops.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; The inner two loops of a naive matrix multiplication.
+; Not annotated as parallel, so the outer loop should not be analyzed.
+define void @outer_loop_not_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) {
+; CHECK-LABEL: 'outer_loop_not_parallel'
+; CHECK-NEXT:    inner.loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    loop.header:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  %M.is.zero = icmp eq i64 %M, 0
+  br i1 %M.is.zero, label %loop.latch, label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b = load float, ptr %b.addr, align 4
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c = load float, ptr %c.addr, align 4
+  %mul = fmul float %b, %c
+  %a.next = fadd float %a, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %M
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+
+; The inner two loops of a naive matrix multiplication.
+; The outer loop is annotated as parallel.
+define void @outer_loop_parallel(i64 %N, i64 %M, ptr noalias %A, ptr %B, ptr %C) {
+; CHECK-LABEL: 'outer_loop_parallel'
+; CHECK-NEXT:    inner.loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    loop.header:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  %M.is.zero = icmp eq i64 %M, 0
+  br i1 %M.is.zero, label %loop.latch, label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b = load float, ptr %b.addr, align 4, !llvm.access.group !1
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c = load float, ptr %c.addr, align 4, !llvm.access.group !1
+  %mul = fmul float %b, %c
+  %a.next = fadd float %a, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %M
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ 0x0, %loop.header ], [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !1
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
+!1 = distinct !{}

From 78a89034e061cf16ba22e478ee3edeeb09b55362 Mon Sep 17 00:00:00 2001
From: Lou Knauer <lou.knauer at sipearl.com>
Date: Thu, 20 Feb 2025 21:38:40 +0100
Subject: [PATCH 5/5] [LV] Outer-loop vectorization in the default vectorizer
 codepath

---
 .../Vectorize/LoopVectorizationLegality.h     |   4 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  66 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 275 +++++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |  20 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   7 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   8 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  40 +-
 .../outer-loop-vect-in-classic-path.ll        | 831 ++++++++++++++++++
 .../outer-loop-vect-in-classic-path.ll        | 647 ++++++++++++++
 9 files changed, 1843 insertions(+), 55 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
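
To try this out, the new flag must be passed explicitly, and the outer loop
needs an explicit vectorization hint (e.g. #pragma clang loop
vectorize(enable) in the source), mirroring the RUN lines of the tests added
below (input.ll is a placeholder):

  opt -S -passes=loop-vectorize -experimental-olv-in-classic-vect input.ll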

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index e959d93b57275..871a79d081719 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -642,6 +642,10 @@ class LoopVectorizationLegality {
   /// Keep track of the loop edge to an uncountable exit, comprising a pair
   /// of (Exiting, Exit) blocks, if there is exactly one early exit.
   std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
+
+  /// Contains true for a nested loop if it, or any of its parents up
+  /// to the loop to vectorize, needs an inner-loop active lane mask.
+  mutable DenseMap<const Loop *, bool> InnerLoopsNeedingPredication;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 420cbc5384ce4..1b107179ba4ee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -572,6 +572,11 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
   if (VF.isScalar())
     return true;
 
+  // The SCEVAddRecForUniformityRewriter does not support accesses to addresses
+  // invariant w.r.t. the vectorized loop but with recurrences of inner loops.
+  if (!TheLoop->isInnermost())
+    return false;
+
   // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
   // never considered uniform.
   auto *SE = PSE.getSE();
@@ -1207,8 +1212,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     });
   }
 
-  if (!LAI->canVectorizeMemory())
-    return canVectorizeIndirectUnsafeDependences();
+  if (!LAI->canVectorizeMemory()) {
+    if (canVectorizeIndirectUnsafeDependences())
+      return true;
+
+    return false;
+  }
 
   if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1403,7 +1412,31 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
         "Uncountable exiting block must be a direct predecessor of latch");
     return BB == Latch;
   }
-  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+
+  if (LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT))
+    return true;
+
+  // Blocks in inner loops need predication if the inner loop trip-count
+  // is not invariant to the vectorized loop.
+  if (!TheLoop->isInnermost()) {
+    Loop *BBLoop = LI->getLoopFor(BB);
+    if (BBLoop != TheLoop) {
+      if (auto Iter = InnerLoopsNeedingPredication.find(BBLoop);
+          Iter != InnerLoopsNeedingPredication.end())
+        return Iter->second;
+
+      for (Loop *L = BBLoop; L != TheLoop; L = L->getParentLoop())
+        if (!isUniformLoop(L, TheLoop)) {
+          InnerLoopsNeedingPredication[BBLoop] = true;
+          return true;
+        }
+
+      InnerLoopsNeedingPredication[BBLoop] = false;
+      return false;
+    }
+  }
+
+  return false;
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(
@@ -1537,9 +1570,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {
-  assert((UseVPlanNativePath || Lp->isInnermost()) &&
-         "VPlan-native path is not enabled.");
-
   // TODO: ORE should be improved to show more accurate information when an
   // outer loop can't be vectorized because a nested loop is not understood or
   // legal. Something like: "outer_loop_location: loop not vectorized:
@@ -1573,6 +1603,23 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
       return false;
   }
 
+  if (Lp != TheLoop && !UseVPlanNativePath) {
+    // Inner loops must be in loop-simplify and LCSSA form, with the latch
+    // also being the only exiting block.
+    BasicBlock *Exiting = Lp->getExitingBlock();
+    if (!Lp->isLoopSimplifyForm() || !Exiting ||
+        Exiting != Lp->getLoopLatch() || !Lp->isLCSSAForm(*DT)) {
+      reportVectorizationFailure(
+          "The inner loops must exit through their latch",
+          "loop control flow is not understood by vectorizer",
+          "CFGNotUnderstood", ORE, TheLoop);
+      if (DoExtraAnalysis)
+        Result = false;
+      else
+        return false;
+    }
+  }
+
   return Result;
 }
 
@@ -1775,9 +1822,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
 
   // Specific checks for outer loops. We skip the remaining legal checks at this
   // point because they don't support outer loops.
-  if (!TheLoop->isInnermost()) {
-    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
-
+  if (!TheLoop->isInnermost() && UseVPlanNativePath) {
     if (!canVectorizeOuterLoop()) {
       reportVectorizationFailure("Unsupported outer loop",
                                  "UnsupportedOuterLoop", ORE, TheLoop);
@@ -1790,7 +1835,6 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     return Result;
   }
 
-  assert(TheLoop->isInnermost() && "Inner loop expected.");
   // Check if we can if-convert non-single-bb loops.
   unsigned NumBlocks = TheLoop->getNumBlocks();
   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
@@ -1811,7 +1855,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   }
 
   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
-    if (TheLoop->getExitingBlock()) {
+    if (TheLoop->getExitingBlock() || !TheLoop->isInnermost()) {
       reportVectorizationFailure("Cannot vectorize uncountable loop",
                                  "UnsupportedUncountableLoop", ORE, TheLoop);
       if (DoExtraAnalysis)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8a5db28ea0a4..555135a73ce28 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,6 +58,7 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
+#include "VPlanDominatorTree.h"
 #include "VPlanHCFGBuilder.h"
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
@@ -401,6 +402,11 @@ static cl::opt<bool> EnableEarlyExitVectorization(
     cl::desc(
         "Enable vectorization of early exit loops with uncountable exits."));
 
+static cl::opt<bool> ExperimentalOLVInClassicPath(
+    "experimental-olv-in-classic-vect", cl::init(false), cl::Hidden,
+    cl::desc("Enable experimental outer-loop vectorization outside the "
+             "VPlan-native path."));
+
 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
 // variables not overflowing do not hold. See `emitSCEVChecks`.
 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1085,9 +1091,8 @@ class LoopVectorizationCostModel {
     assert(VF.isVector() &&
            "Profitable to scalarize relevant only for VF > 1.");
     assert(
-        TheLoop->isInnermost() &&
+        (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
-
     auto Scalars = InstsToScalarize.find(VF);
     assert(Scalars != InstsToScalarize.end() &&
            "VF not yet analyzed for scalarization profitability");
@@ -1097,7 +1102,7 @@ class LoopVectorizationCostModel {
   /// Returns true if \p I is known to be uniform after vectorization.
   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
     assert(
-        TheLoop->isInnermost() &&
+        (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
     // Pseudo probe needs to be duplicated for each unrolled iteration and
     // vector lane so that profiled loop trip count can be accurately
@@ -1117,7 +1122,7 @@ class LoopVectorizationCostModel {
   /// Returns true if \p I is known to be scalar after vectorization.
   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
     assert(
-        TheLoop->isInnermost() &&
+        (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
     if (VF.isScalar())
       return true;
@@ -1190,7 +1195,7 @@ class LoopVectorizationCostModel {
   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
     assert(VF.isVector() && "Expected VF to be a vector VF");
     assert(
-        TheLoop->isInnermost() &&
+        (TheLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
 
     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
@@ -2205,7 +2210,7 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
     return false;
   }
 
-  if (Hints.getInterleave() > 1) {
+  if (Hints.getInterleave() > 1 && EnableVPlanNativePath) {
     // TODO: Interleave support is future work.
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                          "outer loops.\n");
@@ -2224,7 +2229,8 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
   // are stress testing the VPlan H-CFG construction, we collect the outermost
   // loop of every loop nest.
   if (L.isInnermost() || VPlanBuildStressTest ||
-      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+      ((EnableVPlanNativePath || ExperimentalOLVInClassicPath) &&
+       isExplicitVecOuterLoop(&L, ORE))) {
     LoopBlocksRPO RPOT(&L);
     RPOT.perform(LI);
     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
@@ -2932,7 +2938,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
 
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
-  if (EnableVPlanNativePath)
+  if (EnableVPlanNativePath || ExperimentalOLVInClassicPath)
     fixNonInductionPHIs(State);
 
   // After vectorization, the exit blocks of the original loop will have
@@ -3675,6 +3681,31 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
         HasUniformUse.insert(Ptr);
     }
 
+  if (!TheLoop->isInnermost()) {
+    SmallVector<Loop *> Loops(ArrayRef(TheLoop->getSubLoops()));
+    while (!Loops.empty()) {
+      auto *Lp = Loops.pop_back_val();
+      // Inner-loop inductions can be uniform, as well as their backedge values.
+      for (PHINode &Phi : Lp->getHeader()->phis())
+        if (Legal->isInvariant(&Phi)) {
+          AddToWorklistIfAllowed(&Phi);
+          auto *BackedgeVal = Phi.getIncomingValueForBlock(Lp->getLoopLatch());
+          assert(Legal->isInvariant(BackedgeVal));
+          if (auto *I = dyn_cast<Instruction>(BackedgeVal))
+            AddToWorklistIfAllowed(I);
+        }
+
+      // The exit condition of an inner loop can be uniform.
+      auto *Br = cast<BranchInst>(Lp->getLoopLatch()->getTerminator());
+      auto *ICmp = dyn_cast<ICmpInst>(Br->getCondition());
+      if (ICmp && Legal->isInvariant(ICmp->getOperand(0)) &&
+          Legal->isInvariant(ICmp->getOperand(1)))
+        AddToWorklistIfAllowed(ICmp);
+
+      Loops.append(Lp->getSubLoops().begin(), Lp->getSubLoops().end());
+    }
+  }
+
   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
   // demanding) users.  Since loops are assumed to be in LCSSA form, this
   // disallows uses outside the loop as well.
@@ -6408,14 +6439,23 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
   if (!Legal->isInvariant(Op))
     return false;
+
   // Consider Op invariant, if it or its operands aren't predicated
   // instruction in the loop. In that case, it is not trivially hoistable.
   auto *OpI = dyn_cast<Instruction>(Op);
-  return !OpI || !TheLoop->contains(OpI) ||
-         (!isPredicatedInst(OpI) &&
-          (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
-          all_of(OpI->operands(),
-                 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
+  if (!OpI || !TheLoop->contains(OpI))
+    return true;
+
+  // Be pessimistic in the case of inner loops and do not assume things are
+  // invariant. The approach below results in an endless loop if an
+  // inner-loop header PHI is among the operands.
+  if (!TheLoop->isInnermost())
+    return false;
+
+  return !isPredicatedInst(OpI) &&
+         (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
+         all_of(OpI->operands(),
+                [this](Value *Op) { return shouldConsiderInvariant(Op); });
 }
 
 InstructionCost
@@ -7134,7 +7174,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 }
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(OrigLoop->isInnermost() && "Inner loop expected.");
+  assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
+         "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -7577,6 +7618,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
       BestPlan.getVectorLoopRegion()->getSingleSuccessor() !=
           BestPlan.getMiddleBlock();
   assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop ||
+          ExperimentalOLVInClassicPath ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
                                                 CostCtx, OrigLoop) ||
           planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
@@ -8265,7 +8307,7 @@ void VPRecipeBuilder::createHeaderMask() {
   BlockMaskCache[Header] = BlockMask;
 }
 
-VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+VPValue *VPRecipeBuilder::getBlockInMask(const BasicBlock *BB) const {
   // Return the cached value.
   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
   assert(BCEntryIt != BlockMaskCache.end() &&
@@ -8986,7 +9028,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
-  assert(OrigLoop->isInnermost() && "Inner loop expected.");
+  assert((OrigLoop->isInnermost() || ExperimentalOLVInClassicPath) &&
+         "Inner loop expected.");
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -9295,6 +9338,141 @@ static void addExitUsersForFirstOrderRecurrences(
   }
 }
 
+// Called before visiting the first instruction in the entry block
+// of the inner-loop region.
+static void enterInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder,
+                                 VPRecipeBuilder &RecipeBuilder,
+                                 VPRegionBlock &Region, ScalarEvolution &SE,
+                                 const Loop *TheLoop, const LoopInfo &LI) {
+  VPBasicBlock *Entry = Region.getEntryBasicBlock();
+  const Loop *InnerLoop = LI.getLoopFor(HCFGBuilder.getIRBBForVPB(Entry));
+  assert(InnerLoop->isLoopSimplifyForm() && InnerLoop->getNumBackEdges() == 1 &&
+         InnerLoop->getExitingBlock());
+
+  // Handle the inner-loop header phis.
+  const BasicBlock *IRPreheader = InnerLoop->getLoopPreheader();
+  for (VPRecipeBase &R : Entry->phis()) {
+    // TODO: If the phi has only uniform users (can happen for inner-loop
+    // inductions), then creating a scalar phi instead would be
+    // beneficial, or even a scalar and a widened phi in case the inner-loop
+    // induction has uniform and non-uniform users.
+    auto *Phi = cast<VPWidenPHIRecipe>(&R);
+    auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+    Phi->setOperand(0, RecipeBuilder.getVPValueOrAddLiveIn(
+                           IRPhi->getIncomingValueForBlock(IRPreheader)));
+
+    // This will ensure that this instruction is kept and not replaced when
+    // the entry block instructions are visited.
+    RecipeBuilder.setRecipe(IRPhi, Phi);
+  }
+
+  // Handle predication for the inner loop.
+  VPValue *PreheaderMask = RecipeBuilder.getBlockInMask(IRPreheader);
+  const SCEV *BTC = SE.getBackedgeTakenCount(InnerLoop);
+  bool NeedsActiveLaneMask =
+      !isa<SCEVCouldNotCompute>(BTC) && SE.isLoopInvariant(BTC, TheLoop);
+  if (NeedsActiveLaneMask) {
+    auto *InnerALM = new VPWidenPHIRecipe(nullptr);
+    if (!PreheaderMask)
+      PreheaderMask = Region.getPlan()->getOrAddLiveIn(
+          ConstantInt::getTrue(SE.getContext()));
+    // The backedge value will be filled in when the exit block of the
+    // region is visited.
+    InnerALM->addOperand(PreheaderMask);
+    InnerALM->insertBefore(*Entry, Entry->getFirstNonPhi());
+    RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), InnerALM);
+  } else {
+    RecipeBuilder.setBlockInMask(InnerLoop->getHeader(), PreheaderMask);
+  }
+}
+
+// Called after the exiting block of the region is visited, before
+// visiting the exit block.
+static void exitInnerLoopRegion(VPlanHCFGBuilder &HCFGBuilder,
+                                VPRecipeBuilder &RecipeBuilder,
+                                VPRegionBlock &Region) {
+
+  auto *Entry = Region.getEntryBasicBlock();
+  auto *Exiting = Region.getExitingBasicBlock();
+  const auto *IRHeader = HCFGBuilder.getIRBBForVPB(Entry);
+  const auto *IRBr =
+      cast<BranchInst>(HCFGBuilder.getIRBBForVPB(Exiting)->getTerminator());
+  bool ExitIfTrue = IRBr->getSuccessor(1) == IRHeader;
+
+  // Create the inner-loop exit condition and the backedge value for the
+  // inner-loop active-lane mask (if needed).
+  VPValue *ExitCond = RecipeBuilder.getVPValueOrAddLiveIn(IRBr->getCondition());
+  auto *ALM = dyn_cast_or_null<VPWidenPHIRecipe>(
+      RecipeBuilder.getBlockInMask(IRHeader));
+  VPBuilder Builder(Exiting, Exiting->end());
+  DebugLoc DL = IRBr->getDebugLoc();
+  if (ALM && ALM->getParent() == Entry) {
+    assert(!ALM->getUnderlyingValue() && ALM->getNumOperands() == 1);
+    if (ExitIfTrue)
+      ExitCond = Builder.createNot(ExitCond, DL);
+
+    auto *ALMBackedgeVal = Builder.createLogicalAnd(ALM, ExitCond, DL);
+    ALM->addOperand(ALMBackedgeVal);
+    auto *Any =
+        Builder.createNaryOp(VPInstruction::AnyOf, {ALMBackedgeVal}, DL);
+    ExitCond = Builder.createNot(Any, DL);
+  } else if (!ExitIfTrue) {
+    ExitCond = Builder.createNot(ExitCond, DL);
+  }
+  Builder.createNaryOp(VPInstruction::BranchOnCond, {ExitCond}, DL);
+
+  // Set the backedge values of the inner-loop header phis.
+  const auto *IRPreheader =
+      HCFGBuilder.getIRBBForVPB(Region.getSinglePredecessor());
+  for (VPRecipeBase &R : Entry->phis()) {
+    auto *Phi = cast<VPWidenPHIRecipe>(&R);
+    if (Phi == ALM)
+      continue;
+
+    auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+    Phi->setOperand(1, RecipeBuilder.getVPValueOrAddLiveIn(
+                           IRPhi->getIncomingValueForBlock(IRBr->getParent())));
+  }
+
+  // Handle the LCSSA phis for inner-loop live-out values.
+  auto *ExitBlock = cast<VPBasicBlock>(Region.getSingleSuccessor());
+  for (VPRecipeBase &R : ExitBlock->phis()) {
+    auto *Phi = cast<VPWidenPHIRecipe>(&R);
+    auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+    assert(Phi->getNumOperands() == 1);
+    RecipeBuilder.setRecipe(IRPhi, Phi);
+    VPValue *OutVal =
+        RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValue(0));
+    VPRecipeBase *OutValDef = OutVal->getDefiningRecipe();
+    if (OutValDef && OutValDef->getParent()->getParent() == &Region && ALM &&
+        ALM->getParent() == Entry) {
+      // In case there is an inner-loop active-lane mask, each lane of the
+      // inner loop's live-out value must contain the value from the last
+      // iteration in which that lane was active. For this, a new phi is
+      // created that passes through the value from the last iteration if
+      // the lane is inactive, and the current value if it is active.
+      auto *PassthroughPhi = new VPWidenPHIRecipe(IRPhi);
+      PassthroughPhi->addOperand(
+          Region.getPlan()->getOrAddLiveIn(PoisonValue::get(IRPhi->getType())));
+      PassthroughPhi->insertBefore(*Entry, Entry->getFirstNonPhi());
+
+      auto *Select =
+          new VPInstruction(Instruction::Select, {ALM, OutVal, PassthroughPhi},
+                            OutValDef->getDebugLoc());
+      Select->insertAfter(OutValDef);
+
+      PassthroughPhi->addOperand(Select);
+      OutVal = Select;
+    }
+
+    Phi->setOperand(0, OutVal);
+  }
+
+  // The mask of the exit block should be that of the preheader.
+  RecipeBuilder.setBlockInMask(HCFGBuilder.getIRBBForVPB(ExitBlock),
+                               RecipeBuilder.getBlockInMask(IRPreheader));
+}
+
 VPlanPtr
 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
@@ -9378,9 +9556,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+
   BasicBlock *HeaderBB = OrigLoop->getHeader();
   bool NeedsMasks =
-      CM.foldTailByMasking() ||
+      CM.foldTailByMasking() || !OrigLoop->isInnermost() ||
       any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
@@ -9392,12 +9571,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       HeaderVPBB);
 
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
   VPBlockBase *PrevVPBB = nullptr;
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+  for (VPBlockBase *VPBlock : RPOT) {
+    // Handle entering a new inner-loop region.
+    if (auto *Region = dyn_cast<VPRegionBlock>(VPBlock)) {
+      assert(ExperimentalOLVInClassicPath);
+      enterInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region, *PSE.getSE(),
+                           OrigLoop, *LI);
+
+      // The inner-loop region can keep its successor connection and should be
+      // connected to its RPO predecessor, but when visiting the entry block of
+      // the inner loop, there should be no connection to the RPO predecessor.
+      assert(Region->getNumSuccessors() == 1 && PrevVPBB &&
+             "Invalid inner loop (expected preheader and dedicated exit)");
+      VPBlockUtils::connectBlocks(PrevVPBB, Region);
+      PrevVPBB = nullptr;
+      continue;
+    }
+
+    VPBasicBlock *VPBB = cast<VPBasicBlock>(VPBlock);
+
     // Handle VPBBs down to the latch.
     if (VPBB == LoopRegion->getExiting()) {
       assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
@@ -9409,7 +9606,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     // Create mask based on the IR BB corresponding to VPBB.
     // TODO: Predicate directly based on VPlan.
     Builder.setInsertPoint(VPBB, VPBB->begin());
-    if (VPBB == HeaderVPBB) {
+    if (RecipeBuilder.hasBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB))) {
+      Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+    } else if (VPBB == HeaderVPBB) {
       Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
       RecipeBuilder.createHeaderMask();
     } else if (NeedsMasks) {
@@ -9429,7 +9628,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       // FIXME: Migrate code relying on the underlying instruction from VPlan0
       // to construct recipes below to not use the underlying instruction.
       if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
-          (isa<VPInstruction>(&R) && !UnderlyingValue))
+          (isa<VPInstruction>(&R) && !UnderlyingValue) ||
+          (isa<VPWidenPHIRecipe>(&R) &&
+           (!UnderlyingValue ||
+            RecipeBuilder.hasRecipe(cast<Instruction>(UnderlyingValue)))))
         continue;
 
       // FIXME: VPlan0, which models a copy of the original scalar loop, should
@@ -9451,6 +9653,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       Builder.setInsertPoint(SingleDef);
       SmallVector<VPValue *, 4> Operands;
       auto *Phi = dyn_cast<PHINode>(Instr);
+      if (Phi && RecipeBuilder.hasRecipe(Phi))
+        // Skip over LCSSA or inner-loop header phis.
+        continue;
+
       if (Phi && Phi->getParent() == HeaderBB) {
         // The backedge value will be added in fixHeaderPhis later.
         Operands.push_back(Plan->getOrAddLiveIn(
@@ -9498,6 +9704,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       R.eraseFromParent();
     }
 
+    // Handle the exit of an inner-loop region.
+    if (auto *Region = VPBB->getParent();
+        Region && Region->getExiting() == VPBB) {
+      exitInnerLoopRegion(HCFGBuilder, RecipeBuilder, *Region);
+
+      if (PrevVPBB)
+        VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+
+      // The region will already be connected to its single successor.
+      assert(Region->getNumSuccessors() == 1 && VPBB->getNumSuccessors() == 0);
+      PrevVPBB = nullptr;
+      continue;
+    }
+
     // Flatten the CFG in the loop. Masks for blocks have already been generated
     // and added to recipes as needed. To do so, first disconnect VPBB from its
     // successors. Then connect VPBB to the previously visited VPBB.
@@ -10460,9 +10680,6 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
 }
 
 bool LoopVectorizePass::processLoop(Loop *L) {
-  assert((EnableVPlanNativePath || L->isInnermost()) &&
-         "VPlan-native path is not enabled. Only process inner loops.");
-
   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                     << L->getHeader()->getParent()->getName() << "' from "
                     << L->getLocStr() << "\n");
@@ -10520,11 +10737,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // even evaluating whether vectorization is profitable. Since we cannot modify
   // the incoming IR, we need to build VPlan upfront in the vectorization
   // pipeline.
-  if (!L->isInnermost())
+  //
+  // The normal vectorization codepath now also has experimental support for
+  // outer-loop vectorization.
+  if (!L->isInnermost() && EnableVPlanNativePath)
     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                         ORE, BFI, PSI, Hints, Requirements);
 
-  assert(L->isInnermost() && "Inner loop expected.");
+  assert((L->isInnermost() || ExperimentalOLVInClassicPath) &&
+         "Inner loop expected.");
 
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
@@ -10534,7 +10755,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     UseInterleaved = EnableInterleavedMemAccesses;
 
   // Analyze interleaved memory accesses.
-  if (UseInterleaved)
+  if (UseInterleaved && L->isInnermost())
     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
 
   if (LVL.hasUncountableEarlyExit()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e8d3ad89e14cf..464f43927f780 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -196,7 +196,17 @@ class VPRecipeBuilder {
   void createBlockInMask(BasicBlock *BB);
 
   /// Returns the *entry* mask for the block \p BB.
-  VPValue *getBlockInMask(BasicBlock *BB) const;
+  VPValue *getBlockInMask(const BasicBlock *BB) const;
+
+  /// Returns true if there already is a block-in mask for \p BB.
+  bool hasBlockInMask(BasicBlock *BB) const {
+    return BlockMaskCache.contains(BB);
+  }
+
+  /// Set the block-in mask of \p BB directly.
+  void setBlockInMask(BasicBlock *BB, VPValue *Mask) {
+    BlockMaskCache[BB] = Mask;
+  }
 
   /// Create an edge mask for every destination of cases and/or default.
   void createSwitchEdgeMasks(SwitchInst *SI);
@@ -225,6 +235,14 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range);
 
+  /// Return true if there already is a recipe for the given ingredient.
+  bool hasRecipe(Instruction *I) const { return Ingredient2Recipe.contains(I); }
+
+  /// Build a VPReplicateRecipe for \p I. If it is predicated, add the mask
+  /// as the last operand. Range.End may be decreased to ensure the same
+  /// recipe behavior from \p Range.Start to \p Range.End.
+  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range);
+
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
   void fixHeaderPhis();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index cd111365c134c..ac8823df0c2f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -970,7 +970,12 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
+  // When outer-loop vectorizing and the trip count is known, it is possible
+  // that VPlanTransforms::optimizeForVFAndUF() destroys the vector loop region;
+  // getVectorLoopRegion() then incorrectly returns the inner-loop region.
+  assert((!getVectorLoopRegion() || VFxUF.getNumUsers() ||
+          !State.LI->getLoopFor(getScalarHeader()->getIRBasicBlock())
+               ->isInnermost()) &&
          "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 15e90bc18bc87..f0786d3d9e529 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1994,11 +1994,17 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
              VPSlotTracker &SlotTracker) const override;
 #endif
 
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
   /// Returns the \p I th incoming VPBasicBlock.
   VPBasicBlock *getIncomingBlock(unsigned I);
 
   /// Returns the \p I th incoming VPValue.
-  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
+  VPValue *getIncomingValue(unsigned I) const { return getOperand(I); }
+
+  /// Return the incoming VPValue for the predecessor \p BB.
+  VPValue *getIncomingValueForBlock(const VPBasicBlock *BB) const;
 };
 
 /// A recipe for handling first-order recurrence phis. The start value is the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..42a918e8c76d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3642,10 +3642,25 @@ VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) {
   return Pred->getExitingBasicBlock();
 }
 
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
-  assert(EnableVPlanNativePath &&
-         "Non-native vplans are not expected to have VPWidenPHIRecipes.");
+VPValue *
+VPWidenPHIRecipe::getIncomingValueForBlock(const VPBasicBlock *BB) const {
+  const VPBasicBlock *Parent = getParent();
+  const VPRegionBlock *Region = Parent->getParent();
+  if (Region && Region->getEntryBasicBlock() == Parent) {
+    if (Region->getSinglePredecessor() == BB)
+      return getOperand(0);
+    if (Region->getExitingBasicBlock() == BB)
+      return getOperand(1);
+  }
+
+  for (unsigned I = 0; I < Parent->getNumPredecessors(); I++)
+    if (Parent->getPredecessors()[I] == BB)
+      return getOperand(I);
 
+  return nullptr;
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   Value *Op0 = State.get(getOperand(0));
   Type *VecTy = Op0->getType();
@@ -3657,23 +3672,20 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-PHI ";
-
-  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
-  // Unless all incoming values are modeled in VPlan  print the original PHI
-  // directly.
-  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
-  // values as VPValues.
-  if (getNumOperands() != OriginalPhi->getNumOperands()) {
-    O << VPlanIngredient(OriginalPhi);
-    return;
-  }
-
   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
 }
 #endif
 
+InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
+                                              VPCostContext &Ctx) const {
+  if (getNumOperands() == 1)
+    return 0; // LCSSA Phis can be considered free.
+
+  return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+}
+
 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 // remove VPActiveLaneMaskPHIRecipe.
 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
new file mode 100644
index 0000000000000..bed6c3ece93a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer-loop-vect-in-classic-path.ll
@@ -0,0 +1,831 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \
+; RUN:     -force-vector-interleave=1 -experimental-olv-in-classic-vect \
+; RUN:     -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC1 %s
+; RUN: opt -S -mtriple=aarch64 -mattr=+sve -passes=loop-vectorize,instcombine,simplifycfg \
+; RUN:     -force-vector-interleave=2 -experimental-olv-in-classic-vect \
+; RUN:     -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck -check-prefix=CHECK-IC2 %s
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+;   #pragma clang loop vectorize(enable)
+;   for (size_t j = 0; j < N; j++) {
+;     float a = 0.;
+;     for (size_t k = 0; k < N; k++)
+;       a += B[i][k] * C[k][j];
+;     A[i][j] = a;
+;   }
+; }
+define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-IC1-LABEL: define void @foo(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1:       [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH9]] ]
+; CHECK-IC1-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1:       [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP14:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; CHECK-IC1-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP7]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]]
+; CHECK-IC1-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC1-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP12:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI2]], [[TMP12]]
+; CHECK-IC1-NEXT:    [[TMP14]] = select <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[VEC_PHI4]]
+; CHECK-IC1-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[TMP15]], 1
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT12]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[M]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP17]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP18:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT6]], splat (i1 true)
+; CHECK-IC1-NEXT:    [[TMP19]] = select <vscale x 4 x i1> [[VEC_PHI3]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP19]])
+; CHECK-IC1-NEXT:    br i1 [[TMP20]], label %[[INNER_LOOP1]], label %[[LOOP_LATCH9]]
+; CHECK-IC1:       [[LOOP_LATCH9]]:
+; CHECK-IC1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP14]], ptr [[TMP21]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT:    br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-IC1:       [[EXIT]]:
+; CHECK-IC1-NEXT:    ret void
+;
+; CHECK-IC2-LABEL: define void @foo(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-IC2-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 3
+; CHECK-IC2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP6]], i64 [[N]])
+; CHECK-IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2:       [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH22:.*]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH22]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT25:%.*]], %[[LOOP_LATCH22]] ]
+; CHECK-IC2-NEXT:    br label %[[INNER_LOOP3:.*]]
+; CHECK-IC2:       [[INNER_LOOP3]]:
+; CHECK-IC2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP21:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP22:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK]], %[[VECTOR_BODY]] ], [ [[TMP33:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK2]], %[[VECTOR_BODY]] ], [ [[TMP34:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI9:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP23:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI10:%.*]] = phi <vscale x 4 x float> [ shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float poison, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %[[VECTOR_BODY]] ], [ [[TMP24:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[TMP40:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP40]]
+; CHECK-IC2-NEXT:    [[TMP41:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT:    [[TMP42:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
+; CHECK-IC2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP14]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP42]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP12]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT12]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP13]], [[M]]
+; CHECK-IC2-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC2-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4
+; CHECK-IC2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP19:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT:    [[TMP20:%.*]] = fmul <vscale x 4 x float> [[BROADCAST_SPLAT13]], [[WIDE_MASKED_LOAD11]]
+; CHECK-IC2-NEXT:    [[TMP21]] = fadd <vscale x 4 x float> [[VEC_PHI5]], [[TMP19]]
+; CHECK-IC2-NEXT:    [[TMP22]] = fadd <vscale x 4 x float> [[VEC_PHI6]], [[TMP20]]
+; CHECK-IC2-NEXT:    [[TMP23]] = select <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x float> [[TMP21]], <vscale x 4 x float> [[VEC_PHI9]]
+; CHECK-IC2-NEXT:    [[TMP24]] = select <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x float> [[TMP22]], <vscale x 4 x float> [[VEC_PHI10]]
+; CHECK-IC2-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP26:%.*]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP26]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT27]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT26]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT:    [[TMP28:%.*]] = add nuw nsw i64 [[TMP27]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP28]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT29]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[TMP26]], [[M]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP29]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[M]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP30]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT16]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP31:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT15]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP32:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP33]] = select <vscale x 4 x i1> [[VEC_PHI7]], <vscale x 4 x i1> [[TMP31]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP34]] = select <vscale x 4 x i1> [[VEC_PHI8]], <vscale x 4 x i1> [[TMP32]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP33]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP35]], i64 0
+; CHECK-IC2-NEXT:    [[TMP36:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP34]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP36]], i64 0
+; CHECK-IC2-NEXT:    [[TMP37:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT18]], [[BROADCAST_SPLATINSERT20]]
+; CHECK-IC2-NEXT:    [[TMP38:%.*]] = shufflevector <vscale x 4 x i1> [[TMP37]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP39:%.*]] = extractelement <vscale x 4 x i1> [[TMP38]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP39]], label %[[INNER_LOOP3]], label %[[LOOP_LATCH22]]
+; CHECK-IC2:       [[LOOP_LATCH22]]:
+; CHECK-IC2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX30:%.*]] = shl i64 [[TMP48]], 4
+; CHECK-IC2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i8, ptr [[TMP47]], i64 [[DOTIDX30]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP23]], ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP24]], ptr [[TMP49]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC2-NEXT:    [[TMP43:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP44:%.*]] = shl i64 [[TMP43]], 2
+; CHECK-IC2-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX]], [[TMP44]]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT25]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP45]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[TMP46:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP46]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-IC2:       [[EXIT]]:
+; CHECK-IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  br label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+  %mul = fmul float %b.load, %c.load
+  %a.next = fadd float %a.phi, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %M
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+;   #pragma clang loop vectorize(enable)
+;   for (size_t j = 0; j < N; j++) {
+;     float a = 0.;
+;     for (size_t k = 0; k < j; k++)
+;       a += B[i][k] * C[k][j];
+;     A[i][j] = a;
+;   }
+; }
+;;; Note that the inner loop's trip-count depends on the outer loop's induction
+;;; variable, as modeled in the sketch below.
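+;;; For illustration only (hypothetical scalar model, not part of the checked
+;;; output; VL and the lane index l are made-up names): one vector iteration of
+;;; the j-loop covers the outer iterations j, j+1, ..., j+VL-1, whose inner
+;;; loops run j, j+1, ..., j+VL-1 times respectively, so lanes finish at
+;;; different inner iterations and must be masked out once exhausted:
+;   // lane l of the vectorized j-loop:
+;   for (size_t k = 0; k < j + l; k++)
+;     a[l] += B[i][k] * C[k][j + l];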
+define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-IC1-LABEL: define void @bar(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1:       [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH3:.*]] ]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH3]] ]
+; CHECK-IC1-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1:       [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP13:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP7]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], [[M]]
+; CHECK-IC1-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP9]]
+; CHECK-IC1-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP12:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT:    [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI3]], [[TMP12]]
+; CHECK-IC1-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI1]], i64 0
+; CHECK-IC1-NEXT:    [[TMP21:%.*]] = add i64 [[TMP15]], 1
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT6]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP21]], [[INDEX]]
+; CHECK-IC1-NEXT:    br i1 [[TMP14]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]]
+; CHECK-IC1:       [[LOOP_LATCH3]]:
+; CHECK-IC1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP13]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT:    br i1 [[TMP20]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-IC1:       [[EXIT]]:
+; CHECK-IC1-NEXT:    ret void
+;
+; CHECK-IC2-LABEL: define void @bar(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 3
+; CHECK-IC2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]])
+; CHECK-IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2:       [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH11:.*]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH11]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT22:%.*]], %[[LOOP_LATCH11]] ]
+; CHECK-IC2-NEXT:    br label %[[INNER_LOOP3:.*]]
+; CHECK-IC2:       [[INNER_LOOP3]]:
+; CHECK-IC2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT16:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT18:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP29:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP30:%.*]], %[[INNER_LOOP3]] ]
+; CHECK-IC2-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP12]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP10]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP11]], [[M]]
+; CHECK-IC2-NEXT:    [[TMP15:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP14]]
+; CHECK-IC2-NEXT:    [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX:%.*]] = shl i64 [[TMP17]], 4
+; CHECK-IC2-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT:    [[TMP20:%.*]] = fmul <vscale x 4 x float> [[WIDE_MASKED_GATHER13]], [[WIDE_MASKED_LOAD14]]
+; CHECK-IC2-NEXT:    [[TMP29]] = fadd <vscale x 4 x float> [[VEC_PHI5]], [[TMP19]]
+; CHECK-IC2-NEXT:    [[TMP30]] = fadd <vscale x 4 x float> [[VEC_PHI6]], [[TMP20]]
+; CHECK-IC2-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP22]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT16]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT15]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI4]], i64 0
+; CHECK-IC2-NEXT:    [[TMP24:%.*]] = add i64 [[TMP23]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP24]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT18]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT17]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[TMP22]], [[INDEX]]
+; CHECK-IC2-NEXT:    br i1 [[TMP25]], label %[[LOOP_LATCH11]], label %[[INNER_LOOP3]]
+; CHECK-IC2:       [[LOOP_LATCH11]]:
+; CHECK-IC2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX19:%.*]] = shl i64 [[TMP27]], 4
+; CHECK-IC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i64 [[DOTIDX19]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP29]], ptr [[TMP26]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP30]], ptr [[TMP28]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-IC2-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP40:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-IC2-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT22]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]])
+; CHECK-IC2-NEXT:    [[TMP42:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-IC2:       [[EXIT]]:
+; CHECK-IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  br label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+  %mul = fmul float %b.load, %c.load
+  %a.next = fadd float %a.phi, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %i
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+;   long a = A[i];
+;   long j = 0;
+;   if (a > 0) {
+;     do {
+;       a -= B[j];
+;       j++;
+;     } while (a > 0);
+;   }
+;   A[i] = a + j;
+; }
+;;; Note that the inner loop is behind a branch, so the start value of the inner
+;;; loop's mask phi must be set correspondingly (see the sketch below). The
+;;; induction of the inner loop is used for a uniform memory access and as a
+;;; live-out, so the vectorized code should contain two phis for it (one scalar
+;;; and one widened).
+;;; Also, in this example, the inner loop backedge is the first successor of
+;;; the latch terminator, not the second one as is assumed by VPlan.
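+;;; Minimal sketch of the entry mask implied here (illustrative pseudocode,
+;;; not checked below; the names are made up): the inner loop runs only for
+;;; lanes where a > 0, so its mask phi must start from the guarded mask rather
+;;; than the plain active-lane mask:
+;   inner_mask_entry = outer_lane_mask & (a > 0); // start value of the inner mask phi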
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) {
+; CHECK-IC1-LABEL: define void @baz(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-IC1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1:       [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3:.*]] ]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT3]] ]
+; CHECK-IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP6:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP7:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK-IC1:       [[INNER_LOOP1]]:
+; CHECK-IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-IC1-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP10]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP9]] = sub <vscale x 2 x i64> [[VEC_PHI3]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC1-NEXT:    [[J2:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[J2]], 1
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT6]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 2 x i64> [[TMP9]], i64 0
+; CHECK-IC1-NEXT:    [[TMP12:%.*]] = icmp slt i64 [[TMP13]], 1
+; CHECK-IC1-NEXT:    br i1 [[TMP12]], label %[[LOOP_LATCH_LOOPEXIT3]], label %[[INNER_LOOP1]]
+; CHECK-IC1:       [[LOOP_LATCH_LOOPEXIT3]]:
+; CHECK-IC1-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-IC1-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT:    br i1 [[TMP15]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-IC1:       [[EXIT]]:
+; CHECK-IC1-NEXT:    ret void
+;
+; CHECK-IC2-LABEL: define void @baz(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT:  [[ENTRY:.*]]:
+; CHECK-IC2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]])
+; CHECK-IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2:       [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11:.*]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], %[[LOOP_LATCH_LOOPEXIT11]] ]
+; CHECK-IC2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX:%.*]] = shl i64 [[TMP8]], 4
+; CHECK-IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP10:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP11:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    br label %[[INNER_LOOP4:.*]]
+; CHECK-IC2:       [[INNER_LOOP4]]:
+; CHECK-IC2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT17:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP18:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP19:%.*]], %[[INNER_LOOP4]] ]
+; CHECK-IC2-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP14]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP15]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI5]], i64 0
+; CHECK-IC2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP16]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP17]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT8]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT9]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP18]] = sub <vscale x 2 x i64> [[VEC_PHI6]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC2-NEXT:    [[TMP19]] = sub <vscale x 2 x i64> [[VEC_PHI7]], [[WIDE_MASKED_GATHER10]]
+; CHECK-IC2-NEXT:    [[J6:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[J6]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT17]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT16]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 2 x i64> [[VEC_PHI5]], i64 0
+; CHECK-IC2-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[TMP22]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP23]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT19]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT18]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 2 x i64> [[TMP18]], i64 0
+; CHECK-IC2-NEXT:    [[TMP25:%.*]] = icmp slt i64 [[TMP24]], 1
+; CHECK-IC2-NEXT:    br i1 [[TMP25]], label %[[LOOP_LATCH_LOOPEXIT11]], label %[[INNER_LOOP4]]
+; CHECK-IC2:       [[LOOP_LATCH_LOOPEXIT11]]:
+; CHECK-IC2-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> [[TMP18]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
+; CHECK-IC2-NEXT:    [[PREDPHI14:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]]
+; CHECK-IC2-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX20:%.*]] = shl i64 [[TMP26]], 4
+; CHECK-IC2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[DOTIDX20]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI14]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC2-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP29:%.*]] = shl i64 [[TMP28]], 1
+; CHECK-IC2-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], [[TMP29]]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT15]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP30]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP31]], label %[[VECTOR_BODY]], label %[[EXIT:.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-IC2:       [[EXIT]]:
+; CHECK-IC2-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  %a.addr = getelementptr inbounds i64, ptr %A, i64 %i
+  %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3
+  %a.is.positive = icmp sgt i64 %a.load, 0
+  br i1 %a.is.positive, label %inner.loop, label %loop.latch
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ]
+  %b.addr = getelementptr inbounds i64, ptr %B, i64 %j
+  %b.load = load i64, ptr %b.addr, align 8, !llvm.access.group !3
+  %a.next = sub i64 %a.phi, %b.load
+  %j.next = add nuw nsw i64 %j, 1
+  %a.is.still.positive = icmp sgt i64 %a.next, 0
+  br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+  %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+  store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Triple-loop nest with the outermost loop being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+;   for (size_t j = 0; j < M; j++)
+;     for (size_t k = 0; k < L; k++)
+;       A[k][i] += B[i][k];
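+;;; Expected access shapes when vectorizing over i (illustrative sketch only,
+;;; assuming row-major arrays with row strides N and L, matching the checked
+;;; output below): A[k][i] is consecutive across lanes and can use masked
+;;; loads/stores, while B[i][k] is strided across lanes and becomes a gather:
+;   &A[k][i + l] == A + k * N + i + l;    // contiguous in the lane index l
+;   &B[i + l][k] == B + (i + l) * L + k;  // stride of L elements per lane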
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-IC1-LABEL: define void @quuz(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC1-NEXT:  [[ENTRY:.*:]]
+; CHECK-IC1-NEXT:    [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-IC1-NEXT:    br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC1:       [[VECTOR_PH]]:
+; CHECK-IC1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC1-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-IC1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[M]], i64 0
+; CHECK-IC1-NEXT:    [[TMP5:%.*]] = icmp eq <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP6:%.*]] = shufflevector <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP11:%.*]] = xor <vscale x 4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-IC1-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-IC1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-IC1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[L]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1:       [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17:.*]] ]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT17]] ]
+; CHECK-IC1-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC1-NEXT:    [[TMP9:%.*]] = mul <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-IC1-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP9]]
+; CHECK-IC1-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    br label %[[MIDDLE_LOOP3:.*]]
+; CHECK-IC1:       [[MIDDLE_LOOP3]]:
+; CHECK-IC1-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT19:%.*]], %[[MIDDLE_LATCH_LOOPEXIT12:.*]] ]
+; CHECK-IC1-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i1> [ [[TMP12]], %[[VECTOR_BODY]] ], [ [[TMP27:%.*]], %[[MIDDLE_LATCH_LOOPEXIT12]] ]
+; CHECK-IC1-NEXT:    [[TMP13:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[VEC_PHI1]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    br label %[[INNER_LOOP5:.*]]
+; CHECK-IC1:       [[INNER_LOOP5]]:
+; CHECK-IC1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT10:%.*]], %[[INNER_LOOP5]] ]
+; CHECK-IC1-NEXT:    [[TMP14:%.*]] = phi <vscale x 4 x i1> [ [[TMP24]], %[[MIDDLE_LOOP3]] ], [ [[TMP25:%.*]], %[[INNER_LOOP5]] ]
+; CHECK-IC1-NEXT:    [[K6:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP15:%.*]] = mul i64 [[K6]], [[N]]
+; CHECK-IC1-NEXT:    [[TMP16:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP15]]
+; CHECK-IC1-NEXT:    [[TMP17:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP10]], <vscale x 4 x i64> [[VEC_PHI]]
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC1-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC1-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC1-NEXT:    [[TMP19:%.*]] = add i64 [[TMP31]], 1
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP19]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT10]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[L]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP20]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT10]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP29:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT11]], splat (i1 true)
+; CHECK-IC1-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP29]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP25]])
+; CHECK-IC1-NEXT:    br i1 [[TMP30]], label %[[INNER_LOOP5]], label %[[MIDDLE_LATCH_LOOPEXIT12]]
+; CHECK-IC1:       [[MIDDLE_LATCH_LOOPEXIT12]]:
+; CHECK-IC1-NEXT:    [[J4:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI2]], i64 0
+; CHECK-IC1-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[J4]], 1
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT19]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT18]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], [[M]]
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP22]], i64 0
+; CHECK-IC1-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT14]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP26:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT15]], splat (i1 true)
+; CHECK-IC1-NEXT:    [[TMP27]] = select <vscale x 4 x i1> [[VEC_PHI1]], <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC1-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP27]])
+; CHECK-IC1-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_LOOP3]], label %[[OUTER_LATCH_LOOPEXIT17]]
+; CHECK-IC1:       [[OUTER_LATCH_LOOPEXIT17]]:
+; CHECK-IC1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-IC1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-IC1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-IC1-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC1-NEXT:    br i1 [[TMP23]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-IC1:       [[EXIT]]:
+; CHECK-IC1-NEXT:    ret void
+;
+; CHECK-IC2-LABEL: define void @quuz(
+; CHECK-IC2-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-IC2-NEXT:  [[ENTRY:.*:]]
+; CHECK-IC2-NEXT:    [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-IC2-NEXT:    br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC2:       [[VECTOR_PH]]:
+; CHECK-IC2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-IC2-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 3
+; CHECK-IC2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 3
+; CHECK-IC2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP4]])
+; CHECK-IC2-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 [[N]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[M]], i64 0
+; CHECK-IC2-NEXT:    [[TMP25:%.*]] = icmp eq <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP9:%.*]] = shufflevector <vscale x 4 x i1> [[TMP25]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[L]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-IC2:       [[VECTOR_BODY]]:
+; CHECK-IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35:.*]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP11]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT35]] ]
+; CHECK-IC2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-IC2-NEXT:    [[B_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-IC2-NEXT:    [[TMP28:%.*]] = mul <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-IC2-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT6]]
+; CHECK-IC2-NEXT:    [[TMP15:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP28]]
+; CHECK-IC2-NEXT:    [[TMP16:%.*]] = getelementptr float, ptr [[B]], <vscale x 4 x i64> [[TMP14]]
+; CHECK-IC2-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    br label %[[MIDDLE_LOOP7:.*]]
+; CHECK-IC2:       [[MIDDLE_LOOP7]]:
+; CHECK-IC2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT38:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26:.*]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI8:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT40:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI9:%.*]] = phi <vscale x 4 x i1> [ [[TMP17]], %[[VECTOR_BODY]] ], [ [[TMP57:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI10:%.*]] = phi <vscale x 4 x i1> [ [[TMP18]], %[[VECTOR_BODY]] ], [ [[TMP58:%.*]], %[[MIDDLE_LATCH_LOOPEXIT26]] ]
+; CHECK-IC2-NEXT:    [[TMP19:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP20:%.*]] = icmp ne <vscale x 4 x i64> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP21:%.*]] = select <vscale x 4 x i1> [[VEC_PHI9]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[VEC_PHI10]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    br label %[[INNER_LOOP11:.*]]
+; CHECK-IC2:       [[INNER_LOOP11]]:
+; CHECK-IC2-NEXT:    [[VEC_PHI12:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT42:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI13:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP7]] ], [ [[BROADCAST_SPLAT44:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI14:%.*]] = phi <vscale x 4 x i1> [ [[TMP21]], %[[MIDDLE_LOOP7]] ], [ [[TMP64:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT:    [[VEC_PHI15:%.*]] = phi <vscale x 4 x i1> [ [[TMP22]], %[[MIDDLE_LOOP7]] ], [ [[TMP43:%.*]], %[[INNER_LOOP11]] ]
+; CHECK-IC2-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI12]], i64 0
+; CHECK-IC2-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], [[N]]
+; CHECK-IC2-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[TMP24]]
+; CHECK-IC2-NEXT:    [[TMP26:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP15]], <vscale x 4 x i64> [[VEC_PHI12]]
+; CHECK-IC2-NEXT:    [[TMP27:%.*]] = getelementptr float, <vscale x 4 x ptr> [[TMP16]], <vscale x 4 x i64> [[VEC_PHI13]]
+; CHECK-IC2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX7:%.*]] = shl i64 [[TMP13]], 4
+; CHECK-IC2-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX7]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x float> poison), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP30:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-IC2-NEXT:    [[TMP31:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_GATHER13]]
+; CHECK-IC2-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[DOTIDX21:%.*]] = shl i64 [[TMP32]], 4
+; CHECK-IC2-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[DOTIDX21]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP30]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[VEC_PHI14]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP31]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[VEC_PHI15]]), !llvm.access.group [[ACC_GRP0]]
+; CHECK-IC2-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI12]], i64 0
+; CHECK-IC2-NEXT:    [[TMP35:%.*]] = add i64 [[TMP34]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT41:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP35]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT42]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT41]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP36:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI13]], i64 0
+; CHECK-IC2-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT43:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT44]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT43]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[TMP35]], [[L]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP38]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT18]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP65:%.*]] = icmp eq i64 [[TMP37]], [[L]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP65]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT20]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP66:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT19]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP67:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT21]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP64]] = select <vscale x 4 x i1> [[VEC_PHI14]], <vscale x 4 x i1> [[TMP66]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP43]] = select <vscale x 4 x i1> [[VEC_PHI15]], <vscale x 4 x i1> [[TMP67]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP44:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP64]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP44]], i64 0
+; CHECK-IC2-NEXT:    [[TMP45:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP43]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT24:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP45]], i64 0
+; CHECK-IC2-NEXT:    [[TMP46:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT22]], [[BROADCAST_SPLATINSERT24]]
+; CHECK-IC2-NEXT:    [[TMP47:%.*]] = shufflevector <vscale x 4 x i1> [[TMP46]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP48:%.*]] = extractelement <vscale x 4 x i1> [[TMP47]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP48]], label %[[INNER_LOOP11]], label %[[MIDDLE_LATCH_LOOPEXIT26]]
+; CHECK-IC2:       [[MIDDLE_LATCH_LOOPEXIT26]]:
+; CHECK-IC2-NEXT:    [[TMP49:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI]], i64 0
+; CHECK-IC2-NEXT:    [[TMP50:%.*]] = add nuw nsw i64 [[TMP49]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT37:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP50]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT38]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT37]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP51:%.*]] = extractelement <vscale x 4 x i64> [[VEC_PHI8]], i64 0
+; CHECK-IC2-NEXT:    [[TMP52:%.*]] = add nuw nsw i64 [[TMP51]], 1
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT39:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP52]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT40]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT39]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP53:%.*]] = icmp eq i64 [[TMP50]], [[M]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP53]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT27]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[TMP52]], [[M]]
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP54]], i64 0
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLAT30:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT29]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP55:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT28]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP56:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT30]], splat (i1 true)
+; CHECK-IC2-NEXT:    [[TMP57]] = select <vscale x 4 x i1> [[VEC_PHI9]], <vscale x 4 x i1> [[TMP55]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP58]] = select <vscale x 4 x i1> [[VEC_PHI10]], <vscale x 4 x i1> [[TMP56]], <vscale x 4 x i1> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP59:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP57]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP59]], i64 0
+; CHECK-IC2-NEXT:    [[TMP60:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP58]])
+; CHECK-IC2-NEXT:    [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP60]], i64 0
+; CHECK-IC2-NEXT:    [[TMP61:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLATINSERT31]], [[BROADCAST_SPLATINSERT33]]
+; CHECK-IC2-NEXT:    [[TMP62:%.*]] = shufflevector <vscale x 4 x i1> [[TMP61]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-IC2-NEXT:    [[TMP63:%.*]] = extractelement <vscale x 4 x i1> [[TMP62]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP63]], label %[[MIDDLE_LOOP7]], label %[[OUTER_LATCH_LOOPEXIT35]]
+; CHECK-IC2:       [[OUTER_LATCH_LOOPEXIT35]]:
+; CHECK-IC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]]
+; CHECK-IC2-NEXT:    [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-IC2-NEXT:    [[TMP40:%.*]] = shl i64 [[TMP39]], 2
+; CHECK-IC2-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]]
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP5]])
+; CHECK-IC2-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP41]], i64 [[TMP5]])
+; CHECK-IC2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT4]]
+; CHECK-IC2-NEXT:    [[TMP42:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-IC2-NEXT:    br i1 [[TMP42]], label %[[VECTOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-IC2:       [[EXIT]]:
+; CHECK-IC2-NEXT:    ret void
+;
+entry:
+  %N.is.zero = icmp eq i64 %N, 0
+  br i1 %N.is.zero, label %exit, label %outer.loop
+
+outer.loop:
+  %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ]
+  %a.inv.gep = getelementptr float, ptr %A, i64 %i
+  %i.x.L = mul i64 %i, %L
+  %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L
+  %M.is.zero = icmp eq i64 %M, 0
+  br i1 %M.is.zero, label %outer.latch, label %middle.loop
+
+middle.loop:
+  %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ]
+  %L.is.zero = icmp eq i64 %L, 0
+  br i1 %L.is.zero, label %middle.latch, label %inner.loop
+
+inner.loop:
+  %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ]
+  %k.x.N = mul i64 %k, %N
+  %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N
+  %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k
+  %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3
+  %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3
+  %res = fadd float %a.load, %b.load
+  store float %res, ptr %a.gep, align 4, !llvm.access.group !3
+  %k.next = add nuw nsw i64 %k, 1
+  %inner.exitcond = icmp eq i64 %k.next, %L
+  br i1 %inner.exitcond, label %middle.latch, label %inner.loop
+
+middle.latch:
+  %j.next = add nuw nsw i64 %j, 1
+  %middle.exitcond = icmp eq i64 %j.next, %M
+  br i1 %middle.exitcond, label %outer.latch, label %middle.loop
+
+outer.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %outer.exitcond = icmp eq i64 %i.next, %N
+  br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
+;.
+; CHECK-IC1: [[ACC_GRP0]] = distinct !{}
+; CHECK-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-IC1: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]}
+;.
+; CHECK-IC2: [[ACC_GRP0]] = distinct !{}
+; CHECK-IC2: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-IC2: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK-IC2: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-IC2: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC2: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK-IC2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META3]], [[META4]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
new file mode 100644
index 0000000000000..46b7bf6f4c7b3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vect-in-classic-path.ll
@@ -0,0 +1,647 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-vectorize,instcombine,simplifycfg -force-vector-width=4 -force-vector-interleave=1 -experimental-olv-in-classic-vect < %s | FileCheck %s
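+; (The -experimental-olv-in-classic-vect flag enables outer-loop vectorization
+; in the default vectorizer codepath, which is what these tests exercise.)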
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+;   #pragma clang loop vectorize(enable)
+;   for (size_t j = 0; j < N; j++) {
+;     float a = 0.;
+;     for (size_t k = 0; k < M; k++)
+;       a += B[i][k] * C[k][j];
+;     A[i][j] = a;
+;   }
+; }
+define void @foo(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH9:.*]] ]
+; CHECK-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK:       [[INNER_LOOP1]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT12:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ splat (i1 true), %[[VECTOR_BODY]] ], [ [[TMP15:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x float> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP7]], [[M]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6]] = fadd <4 x float> [[VEC_PHI]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9]] = select <4 x i1> [[VEC_PHI3]], <4 x float> [[TMP6]], <4 x float> [[VEC_PHI4]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[TMP19]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP21]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT12]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[TMP21]], [[M]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[TMP12]], i64 0
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15]] = select <4 x i1> [[VEC_PHI3]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i1> [[TMP15]] to i4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP16]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label %[[LOOP_LATCH9]], label %[[INNER_LOOP1]]
+; CHECK:       [[LOOP_LATCH9]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    br label %[[INNER_LOOP:.*]]
+; CHECK:       [[INNER_LOOP]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]]
+; CHECK-NEXT:    [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[JXM:%.*]] = mul i64 [[J]], [[M]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]]
+; CHECK-NEXT:    [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[I]]
+; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]]
+; CHECK-NEXT:    [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]]
+; CHECK-NEXT:    br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  br label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+  %mul = fmul float %b.load, %c.load
+  %a.next = fadd float %a.phi, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %M
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Effectively the inner two loops of:
+; for (size_t i = 0; i < N; i++) {
+;   #pragma clang loop vectorize(enable)
+;   for (size_t j = 0; j < N; j++) {
+;     float a = 0.;
+;     for (size_t k = 0; k < j; k++)
+;       a += B[i][k] * C[k][j];
+;     A[i][j] = a;
+;   }
+; }
+;;; Note that the inner loop's trip-count depends on the outer loop.
+define void @bar(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B, ptr readonly %C) {
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_VEC1:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[N_VEC:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[LOOP_LATCH3:.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[LOOP_LATCH3]] ]
+; CHECK-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK:       [[INNER_LOOP1]]:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT6:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP44:%.*]], %[[INNER_LOOP1]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP5]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i64 3
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP12]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP14]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP15]], i64 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP16]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 2
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP41]]
+; CHECK-NEXT:    [[TMP44]] = fadd <4 x float> [[VEC_PHI3]], [[TMP43]]
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX_NEXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT5]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP51]], label %[[LOOP_LATCH3]], label %[[INNER_LOOP1]]
+; CHECK:       [[LOOP_LATCH3]]:
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[N_VEC]]
+; CHECK-NEXT:    store <4 x float> [[TMP44]], ptr [[TMP28]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[N_VEC]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC1]]
+; CHECK-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC1]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    br label %[[INNER_LOOP:.*]]
+; CHECK:       [[INNER_LOOP]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi float [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0.000000e+00, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[B_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[J]]
+; CHECK-NEXT:    [[B_LOAD:%.*]] = load float, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[JXM:%.*]] = mul i64 [[J]], [[M]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr float, ptr [[C]], i64 [[JXM]]
+; CHECK-NEXT:    [[C_ADDR:%.*]] = getelementptr float, ptr [[TMP52]], i64 [[I]]
+; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[B_LOAD]], [[C_LOAD]]
+; CHECK-NEXT:    [[A_NEXT]] = fadd float [[A_PHI]], [[MUL]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[INNER_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[I]]
+; CHECK-NEXT:    br i1 [[INNER_EXITCOND]], label %[[LOOP_LATCH]], label %[[INNER_LOOP]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    store float [[A_NEXT]], ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  br label %inner.loop
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi float [ %a.next, %inner.loop ], [ 0.0, %loop.header ]
+  %b.addr = getelementptr inbounds float, ptr %B, i64 %j
+  %b.load = load float, ptr %b.addr, align 4, !llvm.access.group !3
+  %jxM = mul i64 %j, %M
+  %jxMpi = add i64 %jxM, %i
+  %c.addr = getelementptr inbounds float, ptr %C, i64 %jxMpi
+  %c.load = load float, ptr %c.addr, align 4, !llvm.access.group !3
+  %mul = fmul float %b.load, %c.load
+  %a.next = fadd float %a.phi, %mul
+  %j.next = add nuw nsw i64 %j, 1
+  %inner.exitcond = icmp eq i64 %j.next, %i
+  br i1 %inner.exitcond, label %loop.latch, label %inner.loop
+
+loop.latch:
+  %a.lcssa = phi float [ %a.next, %inner.loop ]
+  %a.addr = getelementptr inbounds float, ptr %A, i64 %i
+  store float %a.lcssa, ptr %a.addr, align 4, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Effectively something like:
+; #pragma clang loop vectorize(enable)
+; for (long i = 0; i < N; i++) {
+;   long a = A[i];
+;   long j = 0;
+;   if (a > 0) {
+;     do {
+;       a -= B[j];
+;       j++;
+;     } while (a > 0);
+;   }
+;   A[i] = a + j;
+; }
+;;; Note that the inner loop is behind a branch, so the start value of the inner
+;;; loop's mask phi must be set correspondingly. The inner loop's induction is
+;;; used for uniform memory accesses and as a live-out, so the vectorized code
+;;; should contain two phis for it (one scalar and one widened).
+;;; Also, in this example, the inner loop backedge is the first successor of
+;;; the latch terminator, not the second one as is assumed by VPlan.
+define void @baz(i64 %N, i64 %M, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @baz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH_LOOPEXIT9:.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    br label %[[INNER_LOOP1:.*]]
+; CHECK:       [[INNER_LOOP1]]:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP20:%.*]], %[[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT:    [[A_ADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK:       [[PRED_LOAD_IF]]:
+; CHECK-NEXT:    [[A_LOAD:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[A_LOAD]], i64 0
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
+; CHECK:       [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i64> [ poison, %[[INNER_LOOP1]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK:       [[PRED_LOAD_IF3]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[TMP9]], i64 1
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK:       [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x i64> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]]
+; CHECK:       [[PRED_LOAD_IF5]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP13]], i64 2
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i64> [ [[TMP11]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP14]], %[[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]]
+; CHECK:       [[PRED_LOAD_IF7]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP17]], i64 3
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE8]]
+; CHECK:       [[PRED_LOAD_CONTINUE8]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP18]], %[[PRED_LOAD_IF7]] ]
+; CHECK-NEXT:    [[TMP20]] = sub <4 x i64> [[VEC_PHI2]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP21]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP22]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp slt i64 [[TMP23]], 1
+; CHECK-NEXT:    br i1 [[TMP24]], label %[[LOOP_LATCH_LOOPEXIT9]], label %[[INNER_LOOP1]]
+; CHECK:       [[LOOP_LATCH_LOOPEXIT9]]:
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[TMP20]], <4 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[A_ADDR1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[A_LOAD1:%.*]] = load i64, ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[A_IS_POSITIVE:%.*]] = icmp sgt i64 [[A_LOAD1]], 0
+; CHECK-NEXT:    br i1 [[A_IS_POSITIVE]], label %[[INNER_LOOP:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[INNER_LOOP]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi i64 [ [[A_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[B_ADDR:%.*]] = getelementptr inbounds nuw i64, ptr [[B]], i64 [[J]]
+; CHECK-NEXT:    [[B_LOAD:%.*]] = load i64, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[A_NEXT]] = sub i64 [[A_PHI]], [[B_LOAD]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[A_IS_STILL_POSITIVE:%.*]] = icmp sgt i64 [[A_NEXT]], 0
+; CHECK-NEXT:    br i1 [[A_IS_STILL_POSITIVE]], label %[[INNER_LOOP]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[A_RES:%.*]] = phi i64 [ [[A_LOAD1]], %[[LOOP_HEADER]] ], [ [[A_NEXT]], %[[INNER_LOOP]] ]
+; CHECK-NEXT:    store i64 [[A_RES]], ptr [[A_ADDR1]], align 8, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[LOOP_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %i = phi i64 [ %i.next, %loop.latch ], [ 0, %entry ]
+  %a.addr = getelementptr inbounds i64, ptr %A, i64 %i
+  %a.load = load i64, ptr %a.addr, align 8, !llvm.access.group !3
+  %a.is.positive = icmp sgt i64 %a.load, 0
+  br i1 %a.is.positive, label %inner.loop, label %loop.latch
+
+inner.loop:
+  %j = phi i64 [ %j.next, %inner.loop ], [ 0, %loop.header ]
+  %a.phi = phi i64 [ %a.next, %inner.loop ], [ 0, %loop.header ]
+  %b.addr = getelementptr inbounds i64, ptr %B, i64 %j
+  %b.load = load i64, ptr %b.addr, align 8, !llvm.access.group !3
+  %a.next = sub i64 %a.phi, %b.load
+  %j.next = add nuw nsw i64 %j, 1
+  %a.is.still.positive = icmp sgt i64 %a.next, 0
+  br i1 %a.is.still.positive, label %inner.loop, label %loop.latch
+
+loop.latch:
+  %a.res = phi i64 [ %a.load, %loop.header ], [ %a.next, %inner.loop ]
+  store i64 %a.res, ptr %a.addr, align 8, !llvm.access.group !3
+  %i.next = add nuw nsw i64 %i, 1
+  %loop.exitcond = icmp eq i64 %i.next, %N
+  br i1 %loop.exitcond, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+;;; Triple-loop nest with the outer-most one being vectorized.
+; #pragma clang loop vectorize(enable)
+; for (size_t i = 0; i < N; i++)
+;   for (size_t j = 0; j < M; j++)
+;     for (size_t k = 0; k < L; k++)
+;       A[k][i] += B[i][k];
+define void @quuz(i64 %N, i64 %M, i64 %L, ptr noalias %A, ptr readonly %B) {
+; CHECK-LABEL: define void @quuz(
+; CHECK-SAME: i64 [[N:%.*]], i64 [[M:%.*]], i64 [[L:%.*]], ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[N_IS_ZERO:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[N_IS_ZERO]], label %[[EXIT:.*]], label %[[OUTER_LOOP_PREHEADER:.*]]
+; CHECK:       [[OUTER_LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLATINSERT]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[L]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[TMP28:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25:.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[OUTER_LATCH_LOOPEXIT25]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    br label %[[MIDDLE_LOOP3:.*]]
+; CHECK:       [[MIDDLE_LOOP3]]:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[BROADCAST_SPLAT27:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i1> [ [[TMP7]], %[[VECTOR_BODY]] ], [ [[TMP65:%.*]], %[[MIDDLE_LATCH_LOOPEXIT20]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[INNER_LOOP5:.*]]
+; CHECK:       [[INNER_LOOP5]]:
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, %[[MIDDLE_LOOP3]] ], [ [[BROADCAST_SPLAT29:%.*]], %[[PRED_STORE_CONTINUE15:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <4 x i1> [ [[TMP4]], %[[MIDDLE_LOOP3]] ], [ [[TMP58:%.*]], %[[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[VEC_PHI6]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP28]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[TMP29]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP32]]
+; CHECK-NEXT:    [[INDEX:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[TMP33]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fadd float [[TMP10]], [[TMP15]]
+; CHECK-NEXT:    store float [[TMP16]], ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 1
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; CHECK:       [[PRED_STORE_IF10]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = or disjoint i64 [[TMP28]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr float, ptr [[TMP19]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 1
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr float, ptr [[TMP24]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fadd float [[TMP22]], [[TMP27]]
+; CHECK-NEXT:    store float [[TMP67]], ptr [[TMP21]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
+; CHECK:       [[PRED_STORE_CONTINUE11]]:
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 2
+; CHECK-NEXT:    br i1 [[TMP68]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; CHECK:       [[PRED_STORE_IF12]]:
+; CHECK-NEXT:    [[TMP30:%.*]] = or disjoint i64 [[TMP28]], 2
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i64> [[TMP5]], i64 2
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr float, ptr [[TMP31]], i64 [[TMP69]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 2
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr float, ptr [[TMP36]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP40:%.*]] = fadd float [[TMP34]], [[TMP39]]
+; CHECK-NEXT:    store float [[TMP40]], ptr [[TMP70]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE13]]
+; CHECK:       [[PRED_STORE_CONTINUE13]]:
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i1> [[VEC_PHI7]], i64 3
+; CHECK-NEXT:    br i1 [[TMP41]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15]]
+; CHECK:       [[PRED_STORE_IF14]]:
+; CHECK-NEXT:    [[TMP42:%.*]] = or disjoint i64 [[TMP28]], 3
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i64> [[TMP5]], i64 3
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr float, ptr [[TMP43]], i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = load float, ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 3
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr float, ptr [[TMP48]], i64 [[TMP49]]
+; CHECK-NEXT:    [[TMP51:%.*]] = load float, ptr [[TMP50]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP52:%.*]] = fadd float [[TMP46]], [[TMP51]]
+; CHECK-NEXT:    store float [[TMP52]], ptr [[TMP45]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE15]]
+; CHECK:       [[PRED_STORE_CONTINUE15]]:
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <4 x i64> [[VEC_PHI6]], i64 0
+; CHECK-NEXT:    [[TMP54:%.*]] = add nuw nsw i64 [[TMP71]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> poison, i64 [[TMP54]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT29]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[TMP54]], [[L]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[TMP55]], i64 0
+; CHECK-NEXT:    [[TMP56:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT16]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <4 x i1> [[TMP56]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP58]] = select <4 x i1> [[VEC_PHI7]], <4 x i1> [[TMP57]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP59:%.*]] = bitcast <4 x i1> [[TMP58]] to i4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP59]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label %[[MIDDLE_LATCH_LOOPEXIT20]], label %[[INNER_LOOP5]]
+; CHECK:       [[MIDDLE_LATCH_LOOPEXIT20]]:
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <4 x i64> [[VEC_PHI]], i64 0
+; CHECK-NEXT:    [[TMP61:%.*]] = add nuw nsw i64 [[TMP60]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <4 x i64> poison, i64 [[TMP61]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT27]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT26]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[TMP61]], [[M]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i1> poison, i1 [[TMP62]], i64 0
+; CHECK-NEXT:    [[TMP63:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT21]], <i1 true, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <4 x i1> [[TMP63]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP65]] = select <4 x i1> [[VEC_PHI4]], <4 x i1> [[TMP64]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP66:%.*]] = bitcast <4 x i1> [[TMP65]] to i4
+; CHECK-NEXT:    [[DOTNOT30:%.*]] = icmp eq i4 [[TMP66]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT30]], label %[[OUTER_LATCH_LOOPEXIT25]], label %[[MIDDLE_LOOP3]]
+; CHECK:       [[OUTER_LATCH_LOOPEXIT25]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP28]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[OUTER_LOOP:.*]]
+; CHECK:       [[OUTER_LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[OUTER_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[A_INV_GEP:%.*]] = getelementptr float, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[I_X_L:%.*]] = mul i64 [[I]], [[L]]
+; CHECK-NEXT:    [[B_INV_GEP:%.*]] = getelementptr float, ptr [[B]], i64 [[I_X_L]]
+; CHECK-NEXT:    [[M_IS_ZERO:%.*]] = icmp eq i64 [[M]], 0
+; CHECK-NEXT:    br i1 [[M_IS_ZERO]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP:.*]]
+; CHECK:       [[MIDDLE_LOOP]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[MIDDLE_LATCH:.*]] ], [ 0, %[[OUTER_LOOP]] ]
+; CHECK-NEXT:    [[L_IS_ZERO:%.*]] = icmp eq i64 [[L]], 0
+; CHECK-NEXT:    br i1 [[L_IS_ZERO]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP:.*]]
+; CHECK:       [[INNER_LOOP]]:
+; CHECK-NEXT:    [[K:%.*]] = phi i64 [ [[K_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[MIDDLE_LOOP]] ]
+; CHECK-NEXT:    [[K_X_N:%.*]] = mul i64 [[K]], [[N]]
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr float, ptr [[A_INV_GEP]], i64 [[K_X_N]]
+; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr float, ptr [[B_INV_GEP]], i64 [[K]]
+; CHECK-NEXT:    [[A_LOAD:%.*]] = load float, ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[B_LOAD:%.*]] = load float, ptr [[B_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd float [[A_LOAD]], [[B_LOAD]]
+; CHECK-NEXT:    store float [[RES]], ptr [[A_GEP]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[K_NEXT]] = add nuw nsw i64 [[K]], 1
+; CHECK-NEXT:    [[INNER_EXITCOND:%.*]] = icmp eq i64 [[K_NEXT]], [[L]]
+; CHECK-NEXT:    br i1 [[INNER_EXITCOND]], label %[[MIDDLE_LATCH]], label %[[INNER_LOOP]]
+; CHECK:       [[MIDDLE_LATCH]]:
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[MIDDLE_EXITCOND:%.*]] = icmp eq i64 [[J_NEXT]], [[M]]
+; CHECK-NEXT:    br i1 [[MIDDLE_EXITCOND]], label %[[OUTER_LATCH]], label %[[MIDDLE_LOOP]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[OUTER_EXITCOND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[OUTER_EXITCOND]], label %[[EXIT]], label %[[OUTER_LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %N.is.zero = icmp eq i64 %N, 0
+  br i1 %N.is.zero, label %exit, label %outer.loop
+
+outer.loop:
+  %i = phi i64 [ %i.next, %outer.latch ], [ 0, %entry ]
+  %a.inv.gep = getelementptr float, ptr %A, i64 %i
+  %i.x.L = mul i64 %i, %L
+  %b.inv.gep = getelementptr float, ptr %B, i64 %i.x.L
+  %M.is.zero = icmp eq i64 %M, 0
+  br i1 %M.is.zero, label %outer.latch, label %middle.loop
+
+middle.loop:
+  %j = phi i64 [ %j.next, %middle.latch ], [ 0, %outer.loop ]
+  %L.is.zero = icmp eq i64 %L, 0
+  br i1 %L.is.zero, label %middle.latch, label %inner.loop
+
+inner.loop:
+  %k = phi i64 [ %k.next, %inner.loop ], [ 0, %middle.loop ]
+  %k.x.N = mul i64 %k, %N
+  %a.gep = getelementptr float, ptr %a.inv.gep, i64 %k.x.N
+  %b.gep = getelementptr float, ptr %b.inv.gep, i64 %k
+  %a.load = load float, ptr %a.gep, align 4, !llvm.access.group !3
+  %b.load = load float, ptr %b.gep, align 4, !llvm.access.group !3
+  %res = fadd float %a.load, %b.load
+  store float %res, ptr %a.gep, align 4, !llvm.access.group !3
+  %k.next = add nuw nsw i64 %k, 1
+  %inner.exitcond = icmp eq i64 %k.next, %L
+  br i1 %inner.exitcond, label %middle.latch, label %inner.loop
+
+middle.latch:
+  %j.next = add nuw nsw i64 %j, 1
+  %middle.exitcond = icmp eq i64 %j.next, %M
+  br i1 %middle.exitcond, label %outer.latch, label %middle.loop
+
+outer.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %outer.exitcond = icmp eq i64 %i.next, %N
+  br i1 %outer.exitcond, label %exit, label %outer.loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
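+; Note on the metadata above: !0 is the loop ID attached to the outer latch
+; branch, !1 asks the vectorizer to vectorize that (outer) loop, and !2
+; declares that all memory accesses tagged with access group !3 (every load
+; and store in these tests) are free of loop-carried dependences with respect
+; to it.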
+;.
+; CHECK: [[ACC_GRP0]] = distinct !{}
+; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK: [[META2]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP0]]}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META4]], [[META3]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]], [[META4]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META4]], [[META3]]}
+;.


