[llvm] [VPlan] Move predication to VPlanTransform (NFC) (WIP). (PR #128420)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 30 09:22:26 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/128420

>From 4d6cabdbed0c85b586add3eb932942de11764908 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 1 Mar 2025 21:07:00 +0000
Subject: [PATCH 1/5] [VPlan] Introduce child regions as VPlan transform.

Further simplify VPlan CFG builder by moving introduction of inner
regions to a VPlan transform, building on
https://github.com/llvm/llvm-project/pull/128419

The HCFG builder now only constructs plain CFGs. I will move it to
VPlanConstruction as follow-up.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   6 +-
 .../Vectorize/VPlanConstruction.cpp           |  33 ++++++
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 104 ++----------------
 .../Transforms/Vectorize/VPlanHCFGBuilder.h   |   8 +-
 .../vplan-printing-outer-loop.ll              |  21 ++--
 .../LoopVectorize/vplan_hcfg_stress_test.ll   |   2 +-
 .../Transforms/Vectorize/VPlanTestBase.h      |   2 +-
 7 files changed, 59 insertions(+), 117 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4b4a56be19fe5..a541b4772f868 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9308,10 +9308,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           Range);
   auto Plan = std::make_unique<VPlan>(OrigLoop);
   // Build hierarchical CFG.
-  // Convert to VPlan-transform and consoliate all transforms for VPlan
+  // TODO: Convert to VPlan-transform and consoliate all transforms for VPlan
   // creation.
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
-  HCFGBuilder.buildHierarchicalCFG();
+  HCFGBuilder.buildPlainCFG();
 
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
@@ -9615,7 +9615,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   auto Plan = std::make_unique<VPlan>(OrigLoop);
   // Build hierarchical CFG
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
-  HCFGBuilder.buildHierarchicalCFG();
+  HCFGBuilder.buildPlainCFG();
 
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index f58f0290b5fa9..ba68ec99b0c74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -14,12 +14,43 @@
 #include "LoopVectorizationPlanner.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "VPlanTransforms.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 
 using namespace llvm;
 
+/// Introduce VPRegionBlocks for each loop modeled using a plain CFG in \p Plan.
+static void introduceInnerLoopRegions(VPlan &Plan) {
+  VPDominatorTree VPDT;
+  VPDT.recalculate(Plan);
+
+  for (VPBlockBase *HeaderVPBB :
+       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
+    if (HeaderVPBB->getNumPredecessors() != 2)
+      continue;
+    VPBlockBase *PreheaderVPBB = HeaderVPBB->getPredecessors()[0];
+    VPBlockBase *LatchVPBB = HeaderVPBB->getPredecessors()[1];
+    if (!VPDT.dominates(HeaderVPBB, LatchVPBB))
+      continue;
+    assert(VPDT.dominates(PreheaderVPBB, HeaderVPBB) &&
+           "preheader must dominate header");
+    VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPBB);
+    VPBlockUtils::disconnectBlocks(LatchVPBB, HeaderVPBB);
+    VPBlockBase *Succ = LatchVPBB->getSingleSuccessor();
+    VPBlockUtils::disconnectBlocks(LatchVPBB, Succ);
+
+    auto *R = Plan.createVPRegionBlock(HeaderVPBB, LatchVPBB, "",
+                                       false /*isReplicator*/);
+    for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPBB))
+      VPBB->setParent(R);
+
+    VPBlockUtils::insertBlockAfter(R, PreheaderVPBB);
+    VPBlockUtils::connectBlocks(R, Succ);
+  }
+}
+
 void VPlanTransforms::introduceTopLevelVectorLoopRegion(
     VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE,
     bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) {
@@ -98,4 +129,6 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
                                ScalarLatchTerm->getDebugLoc(), "cmp.n");
   Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
                        ScalarLatchTerm->getDebugLoc());
+
+  introduceInnerLoopRegions(Plan);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 4b8a2420b3037..5e31b09bcd7d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -12,9 +12,7 @@
 /// components and steps:
 //
 /// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
-/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
-/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
-/// in the plain CFG.
+/// faithfully represents the CFG in the incoming IR.
 /// NOTE: At this point, there is a direct correspondence between all the
 /// VPBasicBlocks created for the initial plain CFG and the incoming
 /// BasicBlocks. However, this might change in the future.
@@ -57,12 +55,8 @@ class PlainCFGBuilder {
   // Hold phi node's that need to be fixed once the plain CFG has been built.
   SmallVector<PHINode *, 8> PhisToFix;
 
-  /// Maps loops in the original IR to their corresponding region.
-  DenseMap<Loop *, VPRegionBlock *> Loop2Region;
-
   // Utility functions.
   void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
-  void setRegionPredsFromBB(VPRegionBlock *VPBB, BasicBlock *BB);
   void fixHeaderPhis();
   VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
 #ifndef NDEBUG
@@ -83,25 +77,6 @@ class PlainCFGBuilder {
 // Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
 // must have no predecessors.
 void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
-  auto GetLatchOfExit = [this](BasicBlock *BB) -> BasicBlock * {
-    auto *SinglePred = BB->getSinglePredecessor();
-    Loop *LoopForBB = LI->getLoopFor(BB);
-    if (!SinglePred || LI->getLoopFor(SinglePred) == LoopForBB)
-      return nullptr;
-    // The input IR must be in loop-simplify form, ensuring a single predecessor
-    // for exit blocks.
-    assert(SinglePred == LI->getLoopFor(SinglePred)->getLoopLatch() &&
-           "SinglePred must be the only loop latch");
-    return SinglePred;
-  };
-  if (auto *LatchBB = GetLatchOfExit(BB)) {
-    auto *PredRegion = getOrCreateVPBB(LatchBB)->getParent();
-    assert(VPBB == cast<VPBasicBlock>(PredRegion->getSingleSuccessor()) &&
-           "successor must already be set for PredRegion; it must have VPBB "
-           "as single successor");
-    VPBB->setPredecessors({PredRegion});
-    return;
-  }
   // Collect VPBB predecessors.
   SmallVector<VPBlockBase *, 2> VPBBPreds;
   for (BasicBlock *Pred : predecessors(BB))
@@ -113,13 +88,6 @@ static bool isHeaderBB(BasicBlock *BB, Loop *L) {
   return L && BB == L->getHeader();
 }
 
-void PlainCFGBuilder::setRegionPredsFromBB(VPRegionBlock *Region,
-                                           BasicBlock *BB) {
-  // BB is a loop header block. Connect the region to the loop preheader.
-  Loop *LoopOfBB = LI->getLoopFor(BB);
-  Region->setPredecessors({getOrCreateVPBB(LoopOfBB->getLoopPredecessor())});
-}
-
 // Add operands to VPInstructions representing phi nodes from the input IR.
 void PlainCFGBuilder::fixHeaderPhis() {
   for (auto *Phi : PhisToFix) {
@@ -150,19 +118,6 @@ static bool isHeaderVPBB(VPBasicBlock *VPBB) {
   return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
 }
 
-/// Return true of \p L loop is contained within \p OuterLoop.
-static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) {
-  if (L->getLoopDepth() < OuterLoop->getLoopDepth())
-    return false;
-  const Loop *P = L;
-  while (P) {
-    if (P == OuterLoop)
-      return true;
-    P = P->getParentLoop();
-  }
-  return false;
-}
-
 // Create a new empty VPBasicBlock for an incoming BasicBlock in the region
 // corresponding to the containing loop  or retrieve an existing one if it was
 // already created. If no region exists yet for the loop containing \p BB, a new
@@ -178,28 +133,6 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
   VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
   BB2VPBB[BB] = VPBB;
-
-  // Get or create a region for the loop containing BB, except for the top
-  // region of TheLoop which is created later.
-  Loop *LoopOfBB = LI->getLoopFor(BB);
-  if (!LoopOfBB || LoopOfBB == TheLoop || !doesContainLoop(LoopOfBB, TheLoop))
-    return VPBB;
-
-  auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB);
-  if (!isHeaderBB(BB, LoopOfBB)) {
-    assert(RegionOfVPBB &&
-           "Region should have been created by visiting header earlier");
-    VPBB->setParent(RegionOfVPBB);
-    return VPBB;
-  }
-
-  assert(!RegionOfVPBB &&
-         "First visit of a header basic block expects to register its region.");
-  // Handle a header - take care of its Region.
-  RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
-  RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
-  RegionOfVPBB->setEntry(VPBB);
-  Loop2Region[LoopOfBB] = RegionOfVPBB;
   return VPBB;
 }
 
@@ -376,15 +309,13 @@ void PlainCFGBuilder::buildPlainCFG(
   for (BasicBlock *BB : RPO) {
     // Create or retrieve the VPBasicBlock for this BB.
     VPBasicBlock *VPBB = getOrCreateVPBB(BB);
-    VPRegionBlock *Region = VPBB->getParent();
     Loop *LoopForBB = LI->getLoopFor(BB);
     // Set VPBB predecessors in the same order as they are in the incoming BB.
     if (!isHeaderBB(BB, LoopForBB)) {
       setVPBBPredsFromBB(VPBB, BB);
-    } else if (Region) {
-      // BB is a loop header and there's a corresponding region, set the
-      // predecessor for it.
-      setRegionPredsFromBB(Region, BB);
+    } else if (LoopForBB != TheLoop) {
+      VPBB->setPredecessors({getOrCreateVPBB(LoopForBB->getLoopPredecessor()),
+                             getOrCreateVPBB(LoopForBB->getLoopLatch())});
     }
 
     // Create VPInstructions for BB.
@@ -423,21 +354,11 @@ void PlainCFGBuilder::buildPlainCFG(
     BasicBlock *IRSucc1 = BI->getSuccessor(1);
     VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
     VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1);
-    if (BB == LoopForBB->getLoopLatch()) {
-      // For a latch we need to set the successor of the region rather than that
-      // of VPBB and it should be set to the exit, i.e., non-header successor,
-      // except for the top region, which is handled elsewhere.
-      assert(LoopForBB != TheLoop &&
-             "Latch of the top region should have been handled earlier");
-      Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1
-                                                       : Successor0);
-      Region->setExiting(VPBB);
-      continue;
-    }
 
-    // Don't connect any blocks outside the current loop except the latch for
-    // now. The latch is handled above.
-    if (LoopForBB) {
+    // Don't connect any blocks outside the current loop except the latch, which
+    // is handled below.
+    if (LoopForBB &&
+        (LoopForBB == TheLoop || BB != LoopForBB->getLoopLatch())) {
       if (!LoopForBB->contains(IRSucc0)) {
         VPBB->setOneSuccessor(Successor1);
         continue;
@@ -461,16 +382,11 @@ void PlainCFGBuilder::buildPlainCFG(
 
   for (const auto &[IRBB, VPB] : BB2VPBB)
     VPB2IRBB[VPB] = IRBB;
+
+  LLVM_DEBUG(Plan.setName("Plain CFG\n"); dbgs() << Plan);
 }
 
 void VPlanHCFGBuilder::buildPlainCFG() {
   PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
   PCFGBuilder.buildPlainCFG(VPB2IRBB);
 }
-
-// Public interface to build a H-CFG.
-void VPlanHCFGBuilder::buildHierarchicalCFG() {
-  // Build Top Region enclosing the plain CFG.
-  buildPlainCFG();
-  LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index f7f98ed7b1755..f2e90d3f4d9b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -30,7 +30,6 @@ namespace llvm {
 
 class Loop;
 class LoopInfo;
-class VPRegionBlock;
 class VPlan;
 class VPlanTestIRBase;
 class VPBlockBase;
@@ -54,15 +53,12 @@ class VPlanHCFGBuilder {
   /// created for a input IR basic block.
   DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
 
-  /// Build plain CFG for TheLoop and connects it to Plan's entry.
-  void buildPlainCFG();
-
 public:
   VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
       : TheLoop(Lp), LI(LI), Plan(P) {}
 
-  /// Build H-CFG for TheLoop and update Plan accordingly.
-  void buildHierarchicalCFG();
+  /// Build plain CFG for TheLoop and connects it to Plan's entry.
+  void buildPlainCFG();
 
   /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
   /// there is no such corresponding block.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
index 625a32c098f94..b4b6d3d760349 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
@@ -6,7 +6,7 @@
 @arr = external global [8 x [8 x i64]], align 16
 
 define void @foo(i64 %n) {
-; CHECK:      VPlan 'HCFGBuilder: Plain CFG
+; CHECK:      VPlan 'Plain CFG
 ; CHECK-NEXT: {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -19,17 +19,14 @@ define void @foo(i64 %n) {
 ; CHECK-NEXT:   EMIT ir<%add> = add ir<%outer.iv>, ir<%n>
 ; CHECK-NEXT: Successor(s): inner
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> inner: {
-; CHECK-NEXT:   inner:
-; CHECK-NEXT:     WIDEN-PHI ir<%inner.iv> = phi ir<0>, ir<%inner.iv.next>
-; CHECK-NEXT:     EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv>
-; CHECK-NEXT:     EMIT store ir<%add>, ir<%gep.2>
-; CHECK-NEXT:     EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1>
-; CHECK-NEXT:     EMIT ir<%inner.ec> = icmp ir<%inner.iv.next>, ir<8>
-; CHECK-NEXT:     EMIT branch-on-cond ir<%inner.ec>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): outer.latch
+; CHECK-NEXT: inner:
+; CHECK-NEXT:   WIDEN-PHI ir<%inner.iv> = phi ir<0>, ir<%inner.iv.next>
+; CHECK-NEXT:   EMIT ir<%gep.2> = getelementptr ir<@arr>, ir<0>, ir<%inner.iv>, ir<%outer.iv>
+; CHECK-NEXT:   EMIT store ir<%add>, ir<%gep.2>
+; CHECK-NEXT:   EMIT ir<%inner.iv.next> = add ir<%inner.iv>, ir<1>
+; CHECK-NEXT:   EMIT ir<%inner.ec> = icmp ir<%inner.iv.next>, ir<8>
+; CHECK-NEXT:   EMIT branch-on-cond ir<%inner.ec>
+; CHECK-NEXT: Successor(s): outer.latch, inner
 ; CHECK-EMPTY:
 ; CHECK-NEXT: outer.latch:
 ; CHECK-NEXT:   EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll
index 89eaca0cfa8c8..29aeb7c4e97f9 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll
@@ -4,7 +4,7 @@
 ; Verify that the stress testing flag for the VPlan H-CFG builder works as
 ; expected with and without enabling the VPlan H-CFG Verifier.
 
-; CHECK: VPlan 'HCFGBuilder: Plain CFG
+; CHECK: VPlan 'Plain CFG
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index caf5d2357411d..92961e44c5e54 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -73,7 +73,7 @@ class VPlanTestIRBase : public testing::Test {
     PredicatedScalarEvolution PSE(*SE, *L);
     auto Plan = std::make_unique<VPlan>(L);
     VPlanHCFGBuilder HCFGBuilder(L, LI.get(), *Plan);
-    HCFGBuilder.buildHierarchicalCFG();
+    HCFGBuilder.buildPlainCFG();
     VPlanTransforms::introduceTopLevelVectorLoopRegion(
         *Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L);
     return Plan;

>From 1bb836a6f937221b4880f071eb4b7fdf165f5ddc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 23 Mar 2025 11:36:53 +0000
Subject: [PATCH 2/5] [VPlan] Move plain CFG construction to VPlanConstruction.
 (NFC)

---
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 -
 .../Transforms/Vectorize/LoopVectorize.cpp    |  23 +-
 .../Vectorize/VPlanConstruction.cpp           | 369 +++++++++++++++++
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 392 ------------------
 .../Transforms/Vectorize/VPlanHCFGBuilder.h   |  73 ----
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +
 .../Transforms/Vectorize/VPlanSlpTest.cpp     |   1 -
 .../Transforms/Vectorize/VPlanTestBase.h      |   6 +-
 8 files changed, 381 insertions(+), 488 deletions(-)
 delete mode 100644 llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
 delete mode 100644 llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 7dac6d0059b26..0dc6a7d2f594f 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -24,7 +24,6 @@ add_llvm_component_library(LLVMVectorize
   VPlan.cpp
   VPlanAnalysis.cpp
   VPlanConstruction.cpp
-  VPlanHCFGBuilder.cpp
   VPlanRecipes.cpp
   VPlanSLP.cpp
   VPlanTransforms.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a541b4772f868..462dac7259437 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,7 +59,6 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
-#include "VPlanHCFGBuilder.h"
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
@@ -9306,13 +9305,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
             return !CM.requiresScalarEpilogue(VF.isVector());
           },
           Range);
-  auto Plan = std::make_unique<VPlan>(OrigLoop);
-  // Build hierarchical CFG.
-  // TODO: Convert to VPlan-transform and consoliate all transforms for VPlan
-  // creation.
-  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
-  HCFGBuilder.buildPlainCFG();
-
+  DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
+  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
       CM.foldTailByMasking(), OrigLoop);
@@ -9391,7 +9385,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     // Handle VPBBs down to the latch.
     if (VPBB == LoopRegion->getExiting()) {
-      assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
+      assert(!VPB2IRBB.contains(VPBB) &&
              "the latch block shouldn't have a corresponding IRBB");
       VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
       break;
@@ -9407,7 +9401,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       // FIXME: At the moment, masks need to be placed at the beginning of the
       // block, as blends introduced for phi nodes need to use it. The created
       // blends should be sunk after the mask recipes.
-      RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
+      RecipeBuilder.createBlockInMask(VPB2IRBB.lookup(VPBB));
     }
 
     // Convert input VPInstructions to widened recipes.
@@ -9610,13 +9604,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   // the vectorization pipeline.
   assert(!OrigLoop->isInnermost());
   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-
-  // Create new empty VPlan
-  auto Plan = std::make_unique<VPlan>(OrigLoop);
-  // Build hierarchical CFG
-  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
-  HCFGBuilder.buildPlainCFG();
-
+  DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
+  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index ba68ec99b0c74..4303375e33ed2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -17,10 +17,379 @@
 #include "VPlanDominatorTree.h"
 #include "VPlanTransforms.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 
+#define DEBUG_TYPE "loop-vectorize"
+
 using namespace llvm;
 
+namespace {
+/// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+  // The outermost loop of the input loop nest considered for vectorization.
+  Loop *TheLoop;
+
+  // Loop Info analysis.
+  LoopInfo &LI;
+
+  // Vectorization plan that we are working on.
+  VPlan &Plan;
+
+  // Builder of the VPlan instruction-level representation.
+  VPBuilder VPIRBuilder;
+
+  // NOTE: The following maps are intentionally destroyed after the plain CFG
+  // construction because subsequent VPlan-to-VPlan transformation may
+  // invalidate them.
+  // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+  DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+  // Map incoming Value definitions to their newly-created VPValues.
+  DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+  // Hold phi nodes that need to be fixed once the plain CFG has been built.
+  SmallVector<PHINode *, 8> PhisToFix;
+
+  // Utility functions.
+  void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+  void fixHeaderPhis();
+  VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+#ifndef NDEBUG
+  bool isExternalDef(Value *Val);
+#endif
+  VPValue *getOrCreateVPOperand(Value *IRVal);
+  void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+  PlainCFGBuilder(Loop *Lp, LoopInfo &LI, VPlan &P)
+      : TheLoop(Lp), LI(LI), Plan(P) {}
+
+  /// Build the plain CFG for TheLoop and connect it to Plan's entry.
+  void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+  // Collect VPBB predecessors.
+  SmallVector<VPBlockBase *, 2> VPBBPreds;
+  for (BasicBlock *Pred : predecessors(BB))
+    VPBBPreds.push_back(getOrCreateVPBB(Pred));
+  VPBB->setPredecessors(VPBBPreds);
+}
+
+static bool isHeaderBB(BasicBlock *BB, Loop *L) {
+  return L && BB == L->getHeader();
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixHeaderPhis() {
+  for (auto *Phi : PhisToFix) {
+    assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+    VPValue *VPVal = IRDef2VPValue[Phi];
+    assert(isa<VPWidenPHIRecipe>(VPVal) &&
+           "Expected WidenPHIRecipe for phi node.");
+    auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal);
+    assert(VPPhi->getNumOperands() == 0 &&
+           "Expected VPInstruction with no operands.");
+
+    Loop *L = LI.getLoopFor(Phi->getParent());
+    assert(isHeaderBB(Phi->getParent(), L));
+    // For header phis, make sure the incoming value from the loop
+    // predecessor is the first operand of the recipe.
+    assert(Phi->getNumOperands() == 2 &&
+           "header phi must have exactly 2 operands");
+    BasicBlock *LoopPred = L->getLoopPredecessor();
+    VPPhi->addOperand(
+        getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)));
+    BasicBlock *LoopLatch = L->getLoopLatch();
+    VPPhi->addOperand(
+        getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)));
+  }
+}
+
+static bool isHeaderVPBB(VPBasicBlock *VPBB) {
+  return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
+}
+
+// Create a new empty VPBasicBlock for the incoming BasicBlock \p BB or retrieve
+// the existing one if it was already created. A new block is named
+// "vector.body" if \p BB is the header of TheLoop and after \p BB otherwise. No
+// VPRegionBlock is created here.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+  if (auto *VPBB = BB2VPBB.lookup(BB)) {
+    // Retrieve existing VPBB.
+    return VPBB;
+  }
+
+  // Create new VPBB.
+  StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
+  LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
+  VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
+  BB2VPBB[BB] = VPBB;
+  return VPBB;
+}
+
+#ifndef NDEBUG
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) outermost loop PH and, c)
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+  // All the Values that are not Instructions are considered external
+  // definitions for now.
+  Instruction *Inst = dyn_cast<Instruction>(Val);
+  if (!Inst)
+    return true;
+
+  BasicBlock *InstParent = Inst->getParent();
+  assert(InstParent && "Expected instruction parent.");
+
+  // Check whether Instruction definition is in loop PH.
+  BasicBlock *PH = TheLoop->getLoopPreheader();
+  assert(PH && "Expected loop pre-header.");
+
+  if (InstParent == PH)
+    // Instruction definition is in outermost loop PH.
+    return false;
+
+  // Check whether Instruction definition is in a loop exit.
+  SmallVector<BasicBlock *> ExitBlocks;
+  TheLoop->getExitBlocks(ExitBlocks);
+  if (is_contained(ExitBlocks, InstParent)) {
+    // Instruction definition is in outermost loop exit.
+    return false;
+  }
+
+  // Check whether Instruction definition is in loop body.
+  return !TheLoop->contains(Inst);
+}
+#endif
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please, look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+  auto VPValIt = IRDef2VPValue.find(IRVal);
+  if (VPValIt != IRDef2VPValue.end())
+    // Operand has an associated VPInstruction or VPValue that was previously
+    // created.
+    return VPValIt->second;
+
+  // Operand doesn't have a previously created VPInstruction/VPValue. This
+  // means that operand is:
+  //   A) a definition external to VPlan,
+  //   B) any other Value without specific representation in VPlan.
+  // For now, we use VPValue to represent A and B and classify both as external
+  // definitions. We may introduce specific VPValue subclasses for them in the
+  // future.
+  assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+  // A and B: Create VPValue and add it to the pool of external definitions and
+  // to the Value->VPValue map.
+  VPValue *NewVPVal = Plan.getOrAddLiveIn(IRVal);
+  IRDef2VPValue[IRVal] = NewVPVal;
+  return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+                                                  BasicBlock *BB) {
+  VPIRBuilder.setInsertPoint(VPBB);
+  // TODO: Model and preserve debug intrinsics in VPlan.
+  for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
+    Instruction *Inst = &InstRef;
+
+    // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+    // visited Inst when we shouldn't, breaking the RPO traversal order.
+    assert(!IRDef2VPValue.count(Inst) &&
+           "Instruction shouldn't have been visited.");
+
+    if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+      if (TheLoop->getLoopLatch() == BB ||
+          any_of(successors(BB),
+                 [this](BasicBlock *Succ) { return !TheLoop->contains(Succ); }))
+        continue;
+
+      // Conditional branch instructions are represented using BranchOnCond
+      // recipes.
+      if (Br->isConditional()) {
+        VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
+        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst);
+      }
+
+      // Skip the rest of the Instruction processing for Branch instructions.
+      continue;
+    }
+
+    if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
+      for (auto Case : SI->cases())
+        Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
+      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
+      continue;
+    }
+
+    VPSingleDefRecipe *NewR;
+    if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+      // Phi node's operands may not have been visited at this point. We create
+      // an empty VPInstruction that we will fix once the whole plain CFG has
+      // been built.
+      NewR = new VPWidenPHIRecipe(Phi, nullptr, Phi->getDebugLoc(), "vec.phi");
+      VPBB->appendRecipe(NewR);
+      if (isHeaderBB(Phi->getParent(), LI.getLoopFor(Phi->getParent()))) {
+        // Header phis need to be fixed after the VPBB for the latch has been
+        // created.
+        PhisToFix.push_back(Phi);
+      } else {
+        // Add operands for VPPhi in the order matching its predecessors in
+        // VPlan.
+        DenseMap<const VPBasicBlock *, VPValue *> VPPredToIncomingValue;
+        for (unsigned I = 0; I != Phi->getNumOperands(); ++I) {
+          VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(I)]] =
+              getOrCreateVPOperand(Phi->getIncomingValue(I));
+        }
+        for (VPBlockBase *Pred : VPBB->getPredecessors())
+          NewR->addOperand(
+              VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock()));
+      }
+    } else {
+      // Translate LLVM-IR operands into VPValue operands and set them in the
+      // new VPInstruction.
+      SmallVector<VPValue *, 4> VPOperands;
+      for (Value *Op : Inst->operands())
+        VPOperands.push_back(getOrCreateVPOperand(Op));
+
+      // Build VPInstruction for any arbitrary Instruction without specific
+      // representation in VPlan.
+      NewR = cast<VPInstruction>(
+          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+    }
+
+    IRDef2VPValue[Inst] = NewR;
+  }
+}
+
+// Main interface to build the plain CFG.
+void PlainCFGBuilder::buildPlainCFG(
+    DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+
+  // 1. Scan the body of the loop in a topological order to visit each basic
+  // block after having visited its predecessor basic blocks. Create a VPBB for
+  // each BB and link it to its successor and predecessor VPBBs. Note that
+  // predecessors must be set in the same order as they are in the incoming IR.
+  // Otherwise, there might be problems with existing phi nodes and algorithms
+  // based on predecessor traversal.
+
+  // Loop PH needs to be explicitly visited since it's not taken into account by
+  // LoopBlocksDFS.
+  BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
+  assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+         "Unexpected loop preheader");
+  for (auto &I : *ThePreheaderBB) {
+    if (I.getType()->isVoidTy())
+      continue;
+    IRDef2VPValue[&I] = Plan.getOrAddLiveIn(&I);
+  }
+
+  LoopBlocksRPO RPO(TheLoop);
+  RPO.perform(&LI);
+
+  for (BasicBlock *BB : RPO) {
+    // Create or retrieve the VPBasicBlock for this BB.
+    VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+    Loop *LoopForBB = LI.getLoopFor(BB);
+    // Set VPBB predecessors in the same order as they are in the incoming BB.
+    if (!isHeaderBB(BB, LoopForBB)) {
+      setVPBBPredsFromBB(VPBB, BB);
+    } else if (LoopForBB != TheLoop) {
+      VPBB->setPredecessors({getOrCreateVPBB(LoopForBB->getLoopPredecessor()),
+                             getOrCreateVPBB(LoopForBB->getLoopLatch())});
+    }
+
+    // Create VPInstructions for BB.
+    createVPInstructionsForVPBB(VPBB, BB);
+
+    if (BB == TheLoop->getLoopLatch()) {
+      VPBasicBlock *HeaderVPBB = getOrCreateVPBB(LoopForBB->getHeader());
+      VPBlockUtils::connectBlocks(VPBB, HeaderVPBB);
+      continue;
+    }
+
+    // Set VPBB successors. We create empty VPBBs for successors if they don't
+    // exist already. Recipes will be created when the successor is visited
+    // during the RPO traversal.
+    if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      SmallVector<VPBlockBase *> Succs = {
+          getOrCreateVPBB(SI->getDefaultDest())};
+      for (auto Case : SI->cases())
+        Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
+      VPBB->setSuccessors(Succs);
+      continue;
+    }
+    auto *BI = cast<BranchInst>(BB->getTerminator());
+    unsigned NumSuccs = succ_size(BB);
+    if (NumSuccs == 1) {
+      auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor());
+      VPBB->setOneSuccessor(isHeaderVPBB(Successor)
+                                ? Successor->getParent()
+                                : static_cast<VPBlockBase *>(Successor));
+      continue;
+    }
+    assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() &&
+           "block must have conditional branch with 2 successors");
+
+    BasicBlock *IRSucc0 = BI->getSuccessor(0);
+    BasicBlock *IRSucc1 = BI->getSuccessor(1);
+    VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
+    VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1);
+
+    // Don't connect any blocks outside the current loop except the latch, which
+    // is handled below.
+    if (LoopForBB &&
+        (LoopForBB == TheLoop || BB != LoopForBB->getLoopLatch())) {
+      if (!LoopForBB->contains(IRSucc0)) {
+        VPBB->setOneSuccessor(Successor1);
+        continue;
+      }
+      if (!LoopForBB->contains(IRSucc1)) {
+        VPBB->setOneSuccessor(Successor0);
+        continue;
+      }
+    }
+
+    VPBB->setTwoSuccessors(Successor0, Successor1);
+  }
+
+  // 2. The whole CFG has been built at this point so all the input Values must
+  // have a VPlan counterpart. Fix VPlan header phi by adding their
+  // corresponding VPlan operands.
+  fixHeaderPhis();
+
+  VPBlockUtils::connectBlocks(Plan.getEntry(),
+                              getOrCreateVPBB(TheLoop->getHeader()));
+
+  for (const auto &[IRBB, VPB] : BB2VPBB)
+    VPB2IRBB[VPB] = IRBB;
+
+  LLVM_DEBUG(Plan.setName("Plain CFG\n"); dbgs() << Plan);
+}
+
+std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(
+    Loop *TheLoop, LoopInfo &LI,
+    DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+  auto Plan = std::make_unique<VPlan>(TheLoop);
+  PlainCFGBuilder Builder(TheLoop, LI, *Plan);
+  Builder.buildPlainCFG(VPB2IRBB);
+  return Plan;
+}
+
 /// Introduce VPRegionBlocks for each loop modeled using a plain CFG in \p Plan.
 static void introduceInnerLoopRegions(VPlan &Plan) {
   VPDominatorTree VPDT;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
deleted file mode 100644
index 5e31b09bcd7d3..0000000000000
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ /dev/null
@@ -1,392 +0,0 @@
-//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the construction of a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR. This construction comprises the following
-/// components and steps:
-//
-/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
-/// faithfully represents the CFG in the incoming IR.
-/// NOTE: At this point, there is a direct correspondence between all the
-/// VPBasicBlocks created for the initial plain CFG and the incoming
-/// BasicBlocks. However, this might change in the future.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanHCFGBuilder.h"
-#include "LoopVectorizationPlanner.h"
-#include "VPlanCFG.h"
-#include "llvm/Analysis/LoopIterator.h"
-
-#define DEBUG_TYPE "loop-vectorize"
-
-using namespace llvm;
-
-namespace {
-// Class that is used to build the plain CFG for the incoming IR.
-class PlainCFGBuilder {
-private:
-  // The outermost loop of the input loop nest considered for vectorization.
-  Loop *TheLoop;
-
-  // Loop Info analysis.
-  LoopInfo *LI;
-
-  // Vectorization plan that we are working on.
-  VPlan &Plan;
-
-  // Builder of the VPlan instruction-level representation.
-  VPBuilder VPIRBuilder;
-
-  // NOTE: The following maps are intentionally destroyed after the plain CFG
-  // construction because subsequent VPlan-to-VPlan transformation may
-  // invalidate them.
-  // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
-  DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
-  // Map incoming Value definitions to their newly-created VPValues.
-  DenseMap<Value *, VPValue *> IRDef2VPValue;
-
-  // Hold phi node's that need to be fixed once the plain CFG has been built.
-  SmallVector<PHINode *, 8> PhisToFix;
-
-  // Utility functions.
-  void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
-  void fixHeaderPhis();
-  VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
-#ifndef NDEBUG
-  bool isExternalDef(Value *Val);
-#endif
-  VPValue *getOrCreateVPOperand(Value *IRVal);
-  void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
-
-public:
-  PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
-      : TheLoop(Lp), LI(LI), Plan(P) {}
-
-  /// Build plain CFG for TheLoop  and connects it to Plan's entry.
-  void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
-};
-} // anonymous namespace
-
-// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
-// must have no predecessors.
-void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
-  // Collect VPBB predecessors.
-  SmallVector<VPBlockBase *, 2> VPBBPreds;
-  for (BasicBlock *Pred : predecessors(BB))
-    VPBBPreds.push_back(getOrCreateVPBB(Pred));
-  VPBB->setPredecessors(VPBBPreds);
-}
-
-static bool isHeaderBB(BasicBlock *BB, Loop *L) {
-  return L && BB == L->getHeader();
-}
-
-// Add operands to VPInstructions representing phi nodes from the input IR.
-void PlainCFGBuilder::fixHeaderPhis() {
-  for (auto *Phi : PhisToFix) {
-    assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
-    VPValue *VPVal = IRDef2VPValue[Phi];
-    assert(isa<VPWidenPHIRecipe>(VPVal) &&
-           "Expected WidenPHIRecipe for phi node.");
-    auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal);
-    assert(VPPhi->getNumOperands() == 0 &&
-           "Expected VPInstruction with no operands.");
-
-    Loop *L = LI->getLoopFor(Phi->getParent());
-    assert(isHeaderBB(Phi->getParent(), L));
-    // For header phis, make sure the incoming value from the loop
-    // predecessor is the first operand of the recipe.
-    assert(Phi->getNumOperands() == 2 &&
-           "header phi must have exactly 2 operands");
-    BasicBlock *LoopPred = L->getLoopPredecessor();
-    VPPhi->addOperand(
-        getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)));
-    BasicBlock *LoopLatch = L->getLoopLatch();
-    VPPhi->addOperand(
-        getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)));
-  }
-}
-
-static bool isHeaderVPBB(VPBasicBlock *VPBB) {
-  return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
-}
-
-// Create a new empty VPBasicBlock for an incoming BasicBlock in the region
-// corresponding to the containing loop  or retrieve an existing one if it was
-// already created. If no region exists yet for the loop containing \p BB, a new
-// one is created.
-VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
-  if (auto *VPBB = BB2VPBB.lookup(BB)) {
-    // Retrieve existing VPBB.
-    return VPBB;
-  }
-
-  // Create new VPBB.
-  StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
-  LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
-  VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
-  BB2VPBB[BB] = VPBB;
-  return VPBB;
-}
-
-#ifndef NDEBUG
-// Return true if \p Val is considered an external definition. An external
-// definition is either:
-// 1. A Value that is not an Instruction. This will be refined in the future.
-// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
-// i.e., is not part of: a) the loop nest, b) outermost loop PH and, c)
-// outermost loop exits.
-bool PlainCFGBuilder::isExternalDef(Value *Val) {
-  // All the Values that are not Instructions are considered external
-  // definitions for now.
-  Instruction *Inst = dyn_cast<Instruction>(Val);
-  if (!Inst)
-    return true;
-
-  BasicBlock *InstParent = Inst->getParent();
-  assert(InstParent && "Expected instruction parent.");
-
-  // Check whether Instruction definition is in loop PH.
-  BasicBlock *PH = TheLoop->getLoopPreheader();
-  assert(PH && "Expected loop pre-header.");
-
-  if (InstParent == PH)
-    // Instruction definition is in outermost loop PH.
-    return false;
-
-  // Check whether Instruction definition is in a loop exit.
-  SmallVector<BasicBlock *> ExitBlocks;
-  TheLoop->getExitBlocks(ExitBlocks);
-  if (is_contained(ExitBlocks, InstParent)) {
-    // Instruction definition is in outermost loop exit.
-    return false;
-  }
-
-  // Check whether Instruction definition is in loop body.
-  return !TheLoop->contains(Inst);
-}
-#endif
-
-// Create a new VPValue or retrieve an existing one for the Instruction's
-// operand \p IRVal. This function must only be used to create/retrieve VPValues
-// for *Instruction's operands* and not to create regular VPInstruction's. For
-// the latter, please, look at 'createVPInstructionsForVPBB'.
-VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
-  auto VPValIt = IRDef2VPValue.find(IRVal);
-  if (VPValIt != IRDef2VPValue.end())
-    // Operand has an associated VPInstruction or VPValue that was previously
-    // created.
-    return VPValIt->second;
-
-  // Operand doesn't have a previously created VPInstruction/VPValue. This
-  // means that operand is:
-  //   A) a definition external to VPlan,
-  //   B) any other Value without specific representation in VPlan.
-  // For now, we use VPValue to represent A and B and classify both as external
-  // definitions. We may introduce specific VPValue subclasses for them in the
-  // future.
-  assert(isExternalDef(IRVal) && "Expected external definition as operand.");
-
-  // A and B: Create VPValue and add it to the pool of external definitions and
-  // to the Value->VPValue map.
-  VPValue *NewVPVal = Plan.getOrAddLiveIn(IRVal);
-  IRDef2VPValue[IRVal] = NewVPVal;
-  return NewVPVal;
-}
-
-// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
-// counterpart. This function must be invoked in RPO so that the operands of a
-// VPInstruction in \p BB have been visited before (except for Phi nodes).
-void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
-                                                  BasicBlock *BB) {
-  VPIRBuilder.setInsertPoint(VPBB);
-  // TODO: Model and preserve debug intrinsics in VPlan.
-  for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
-    Instruction *Inst = &InstRef;
-
-    // There shouldn't be any VPValue for Inst at this point. Otherwise, we
-    // visited Inst when we shouldn't, breaking the RPO traversal order.
-    assert(!IRDef2VPValue.count(Inst) &&
-           "Instruction shouldn't have been visited.");
-
-    if (auto *Br = dyn_cast<BranchInst>(Inst)) {
-      if (TheLoop->getLoopLatch() == BB ||
-          any_of(successors(BB),
-                 [this](BasicBlock *Succ) { return !TheLoop->contains(Succ); }))
-        continue;
-
-      // Conditional branch instruction are represented using BranchOnCond
-      // recipes.
-      if (Br->isConditional()) {
-        VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
-        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst);
-      }
-
-      // Skip the rest of the Instruction processing for Branch instructions.
-      continue;
-    }
-
-    if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
-      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
-      for (auto Case : SI->cases())
-        Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
-      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
-      continue;
-    }
-
-    VPSingleDefRecipe *NewR;
-    if (auto *Phi = dyn_cast<PHINode>(Inst)) {
-      // Phi node's operands may have not been visited at this point. We create
-      // an empty VPInstruction that we will fix once the whole plain CFG has
-      // been built.
-      NewR = new VPWidenPHIRecipe(Phi, nullptr, Phi->getDebugLoc(), "vec.phi");
-      VPBB->appendRecipe(NewR);
-      if (isHeaderBB(Phi->getParent(), LI->getLoopFor(Phi->getParent()))) {
-        // Header phis need to be fixed after the VPBB for the latch has been
-        // created.
-        PhisToFix.push_back(Phi);
-      } else {
-        // Add operands for VPPhi in the order matching its predecessors in
-        // VPlan.
-        DenseMap<const VPBasicBlock *, VPValue *> VPPredToIncomingValue;
-        for (unsigned I = 0; I != Phi->getNumOperands(); ++I) {
-          VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(I)]] =
-              getOrCreateVPOperand(Phi->getIncomingValue(I));
-        }
-        for (VPBlockBase *Pred : VPBB->getPredecessors())
-          NewR->addOperand(
-              VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock()));
-      }
-    } else {
-      // Translate LLVM-IR operands into VPValue operands and set them in the
-      // new VPInstruction.
-      SmallVector<VPValue *, 4> VPOperands;
-      for (Value *Op : Inst->operands())
-        VPOperands.push_back(getOrCreateVPOperand(Op));
-
-      // Build VPInstruction for any arbitrary Instruction without specific
-      // representation in VPlan.
-      NewR = cast<VPInstruction>(
-          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
-    }
-
-    IRDef2VPValue[Inst] = NewR;
-  }
-}
-
-// Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG(
-    DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
-
-  // 1. Scan the body of the loop in a topological order to visit each basic
-  // block after having visited its predecessor basic blocks. Create a VPBB for
-  // each BB and link it to its successor and predecessor VPBBs. Note that
-  // predecessors must be set in the same order as they are in the incomming IR.
-  // Otherwise, there might be problems with existing phi nodes and algorithm
-  // based on predecessors traversal.
-
-  // Loop PH needs to be explicitly visited since it's not taken into account by
-  // LoopBlocksDFS.
-  BasicBlock *ThePreheaderBB = TheLoop->getLoopPreheader();
-  assert((ThePreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
-         "Unexpected loop preheader");
-  for (auto &I : *ThePreheaderBB) {
-    if (I.getType()->isVoidTy())
-      continue;
-    IRDef2VPValue[&I] = Plan.getOrAddLiveIn(&I);
-  }
-
-  LoopBlocksRPO RPO(TheLoop);
-  RPO.perform(LI);
-
-  for (BasicBlock *BB : RPO) {
-    // Create or retrieve the VPBasicBlock for this BB.
-    VPBasicBlock *VPBB = getOrCreateVPBB(BB);
-    Loop *LoopForBB = LI->getLoopFor(BB);
-    // Set VPBB predecessors in the same order as they are in the incoming BB.
-    if (!isHeaderBB(BB, LoopForBB)) {
-      setVPBBPredsFromBB(VPBB, BB);
-    } else if (LoopForBB != TheLoop) {
-      VPBB->setPredecessors({getOrCreateVPBB(LoopForBB->getLoopPredecessor()),
-                             getOrCreateVPBB(LoopForBB->getLoopLatch())});
-    }
-
-    // Create VPInstructions for BB.
-    createVPInstructionsForVPBB(VPBB, BB);
-
-    if (BB == TheLoop->getLoopLatch()) {
-      VPBasicBlock *HeaderVPBB = getOrCreateVPBB(LoopForBB->getHeader());
-      VPBlockUtils::connectBlocks(VPBB, HeaderVPBB);
-      continue;
-    }
-
-    // Set VPBB successors. We create empty VPBBs for successors if they don't
-    // exist already. Recipes will be created when the successor is visited
-    // during the RPO traversal.
-    if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
-      SmallVector<VPBlockBase *> Succs = {
-          getOrCreateVPBB(SI->getDefaultDest())};
-      for (auto Case : SI->cases())
-        Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
-      VPBB->setSuccessors(Succs);
-      continue;
-    }
-    auto *BI = cast<BranchInst>(BB->getTerminator());
-    unsigned NumSuccs = succ_size(BB);
-    if (NumSuccs == 1) {
-      auto *Successor = getOrCreateVPBB(BB->getSingleSuccessor());
-      VPBB->setOneSuccessor(isHeaderVPBB(Successor)
-                                ? Successor->getParent()
-                                : static_cast<VPBlockBase *>(Successor));
-      continue;
-    }
-    assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() &&
-           "block must have conditional branch with 2 successors");
-
-    BasicBlock *IRSucc0 = BI->getSuccessor(0);
-    BasicBlock *IRSucc1 = BI->getSuccessor(1);
-    VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
-    VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1);
-
-    // Don't connect any blocks outside the current loop except the latch, which
-    // is handled below.
-    if (LoopForBB &&
-        (LoopForBB == TheLoop || BB != LoopForBB->getLoopLatch())) {
-      if (!LoopForBB->contains(IRSucc0)) {
-        VPBB->setOneSuccessor(Successor1);
-        continue;
-      }
-      if (!LoopForBB->contains(IRSucc1)) {
-        VPBB->setOneSuccessor(Successor0);
-        continue;
-      }
-    }
-
-    VPBB->setTwoSuccessors(Successor0, Successor1);
-  }
-
-  // 2. The whole CFG has been built at this point so all the input Values must
-  // have a VPlan counterpart. Fix VPlan header phi by adding their
-  // corresponding VPlan operands.
-  fixHeaderPhis();
-
-  VPBlockUtils::connectBlocks(Plan.getEntry(),
-                              getOrCreateVPBB(TheLoop->getHeader()));
-
-  for (const auto &[IRBB, VPB] : BB2VPBB)
-    VPB2IRBB[VPB] = IRBB;
-
-  LLVM_DEBUG(Plan.setName("Plain CFG\n"); dbgs() << Plan);
-}
-
-void VPlanHCFGBuilder::buildPlainCFG() {
-  PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
-  PCFGBuilder.buildPlainCFG(VPB2IRBB);
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
deleted file mode 100644
index f2e90d3f4d9b3..0000000000000
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanHCFGBuilder class which contains the public
-/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR.
-///
-/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
-/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
-/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
-/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
-/// other than the Top Region will have a parent VPRegionBlock and allows us
-/// to easily add more nodes before/after the main vector loop (such as the
-/// reduction epilogue).
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-
-#include "llvm/ADT/DenseMap.h"
-
-namespace llvm {
-
-class Loop;
-class LoopInfo;
-class VPlan;
-class VPlanTestIRBase;
-class VPBlockBase;
-class BasicBlock;
-
-/// Main class to build the VPlan H-CFG for an incoming IR.
-class VPlanHCFGBuilder {
-  friend VPlanTestIRBase;
-
-private:
-  // The outermost loop of the input loop nest considered for vectorization.
-  Loop *TheLoop;
-
-  // Loop Info analysis.
-  LoopInfo *LI;
-
-  // The VPlan that will contain the H-CFG we are building.
-  VPlan &Plan;
-
-  /// Map of create VP blocks to their input IR basic blocks, if they have been
-  /// created for a input IR basic block.
-  DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
-
-public:
-  VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
-      : TheLoop(Lp), LI(LI), Plan(P) {}
-
-  /// Build plain CFG for TheLoop and connects it to Plan's entry.
-  void buildPlainCFG();
-
-  /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
-  /// there is no such corresponding block.
-  /// FIXME: This is a temporary workaround to drive the createBlockInMask.
-  /// Remove once mask creation is done on VPlan.
-  BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
-    return VPB2IRBB.lookup(VPB);
-  }
-};
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index c23ff38265670..15532ee44bc43 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -52,6 +52,10 @@ struct VPlanTransforms {
       verifyVPlanIsValid(Plan);
   }
 
+  static std::unique_ptr<VPlan>
+  buildPlainCFG(Loop *TheLoop, LoopInfo &LI,
+                DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
+
   /// Introduce the top-level VPRegionBlock for the main loop in \p Plan. Coming
   /// into this function, \p Plan's top-level loop is modeled using a plain CFG.
   /// This transform wraps the plain CFG of the top-level loop within a
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index cd8bd4a3565e4..1ffd1a6a7a9b9 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -8,7 +8,6 @@
 
 #include "../lib/Transforms/Vectorize/VPlanSLP.h"
 #include "../lib/Transforms/Vectorize/VPlan.h"
-#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h"
 #include "VPlanTestBase.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 92961e44c5e54..5b851499033a2 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -13,7 +13,6 @@
 #define LLVM_UNITTESTS_TRANSFORMS_VECTORIZE_VPLANTESTBASE_H
 
 #include "../lib/Transforms/Vectorize/VPlan.h"
-#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h"
 #include "../lib/Transforms/Vectorize/VPlanTransforms.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -71,9 +70,8 @@ class VPlanTestIRBase : public testing::Test {
 
     Loop *L = LI->getLoopFor(LoopHeader);
     PredicatedScalarEvolution PSE(*SE, *L);
-    auto Plan = std::make_unique<VPlan>(L);
-    VPlanHCFGBuilder HCFGBuilder(L, LI.get(), *Plan);
-    HCFGBuilder.buildPlainCFG();
+    DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
+    auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB);
     VPlanTransforms::introduceTopLevelVectorLoopRegion(
         *Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L);
     return Plan;

>From 24777562f06c4bed44186362910f21d639c7b4c1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 23 Mar 2025 14:16:27 +0000
Subject: [PATCH 3/5] [VPlan] Add exit operands early.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 +----
 .../Vectorize/VPlanConstruction.cpp           | 23 +++++++++++++++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  2 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 18 +++++++--------
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 462dac7259437..d44aa55867224 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9141,11 +9141,7 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
         continue;
       }
 
-      PHINode &ExitPhi = ExitIRI->getIRPhi();
-      BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
-      Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB);
-      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
-      ExitIRI->addOperand(V);
+      VPValue *V = ExitIRI->getOperand(0);
       if (V->isLiveIn())
         continue;
       assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 4303375e33ed2..cddd723e0c1ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -375,6 +375,24 @@ void PlainCFGBuilder::buildPlainCFG(
   VPBlockUtils::connectBlocks(Plan.getEntry(),
                               getOrCreateVPBB(TheLoop->getHeader()));
 
+  for (auto *EB : Plan.getExitBlocks()) {
+    BasicBlock *IRBB = EB->getIRBasicBlock();
+    for (VPRecipeBase &R : *EB) {
+      auto *PhiR = cast<VPIRInstruction>(&R);
+      auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction());
+      if (!Phi)
+        break;
+      for (Value *Inc : Phi->incoming_values())
+        PhiR->addOperand(getOrCreateVPOperand(Inc));
+      if (R.getNumOperands() > 1 &&
+          Phi->getIncomingBlock(0) != TheLoop->getLoopLatch()) {
+        VPValue *Tmp = R.getOperand(0);
+        R.setOperand(0, R.getOperand(1));
+        R.setOperand(1, Tmp);
+      }
+    }
+  }
+
   for (const auto &[IRBB, VPB] : BB2VPBB)
     VPB2IRBB[VPB] = IRBB;
 
@@ -466,6 +484,11 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
   VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());
   if (!RequiresScalarEpilogueCheck) {
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+    for (auto *EB : Plan.getExitBlocks()) {
+      for (VPRecipeBase &R : *EB)
+        for (unsigned Idx = 0; Idx != R.getNumOperands(); ++Idx)
+          R.setOperand(Idx, Plan.getOrAddLiveIn(PoisonValue::get(InductionTy)));
+    }
     return;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f2d3b1588229a..d1ffab1ca4174 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1089,7 +1089,7 @@ InstructionCost VPIRInstruction::computeCost(ElementCount VF,
 void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) {
   assert(isa<PHINode>(getInstruction()) &&
          "can only add exiting operands to phi nodes");
-  assert(getNumOperands() == 1 && "must have a single operand");
+  // assert(getNumOperands() == 1 && "must have a single operand");
   VPValue *Exiting = getOperand(0);
   if (!Exiting->isLiveIn()) {
     LLVMContext &Ctx = getInstruction().getContext();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8852540aec931..23f8afc1df806 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2317,29 +2317,27 @@ void VPlanTransforms::handleUncountableEarlyExit(
     if (!ExitIRI)
       break;
 
-    PHINode &ExitPhi = ExitIRI->getIRPhi();
-    VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn(
-        ExitPhi.getIncomingValueForBlock(UncountableExitingBlock));
+    unsigned EarlyExitIdx = 0;
 
     if (OrigLoop->getUniqueExitBlock()) {
+      EarlyExitIdx = 1;
       // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors
       // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB
       // which is coming from the original latch.
-      VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn(
-          ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-      ExitIRI->addOperand(IncomingFromLatch);
       ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
     }
+    VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
     // Add the incoming value from the early exit.
     if (!IncomingFromEarlyExit->isLiveIn() && !Plan.hasScalarVFOnly()) {
       VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
           VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
           "first.active.lane");
-      IncomingFromEarlyExit = EarlyExitB.createNaryOp(
-          Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
-          nullptr, "early.exit.value");
+      ExitIRI->setOperand(
+          EarlyExitIdx,
+          EarlyExitB.createNaryOp(Instruction::ExtractElement,
+                                  {IncomingFromEarlyExit, FirstActiveLane},
+                                  nullptr, "early.exit.value"));
     }
-    ExitIRI->addOperand(IncomingFromEarlyExit);
   }
   MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
 

>From a6a8906d8a59cd19e5e42c9df9acacfe8b668d79 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 23 Mar 2025 14:50:48 +0000
Subject: [PATCH 4/5] [VPlan] Retain exit conditions early

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 35 +++++++++--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 +-
 .../Vectorize/VPlanConstruction.cpp           | 61 +++++++++----------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 36 +++++++----
 .../Transforms/Vectorize/VPlanTransforms.h    |  3 +-
 .../vplan-printing-outer-loop.ll              |  5 +-
 6 files changed, 89 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d44aa55867224..710fa31ac4f19 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9324,6 +9324,24 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
+  if (auto *UncountableExitingBlock =
+          Legal->getUncountableEarlyExitingBlock()) {
+    VPlanTransforms::runPass(VPlanTransforms::handleUncountableEarlyExit, *Plan,
+                             *PSE.getSE(), OrigLoop, UncountableExitingBlock);
+  } else {
+    SmallPtrSet<VPBlockBase *, 2> ExitBlocks(Plan->getExitBlocks().begin(),
+                                             Plan->getExitBlocks().end());
+    for (VPBlockBase *VPBB : to_vector(
+             vp_depth_first_shallow(Plan->getVectorLoopRegion()->getEntry()))) {
+      for (VPBlockBase *EB : ExitBlocks) {
+        if (is_contained(VPBB->getSuccessors(), EB)) {
+          cast<VPBasicBlock>(VPBB)->getTerminator()->eraseFromParent();
+          VPBlockUtils::disconnectBlocks(VPBB, EB);
+        }
+      }
+    }
+  }
+
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                 Builder);
 
@@ -9502,12 +9520,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     R->setOperand(1, WideIV->getStepValue());
   }
 
-  if (auto *UncountableExitingBlock =
-          Legal->getUncountableEarlyExitingBlock()) {
-    VPlanTransforms::runPass(VPlanTransforms::handleUncountableEarlyExit, *Plan,
-                             *PSE.getSE(), OrigLoop, UncountableExitingBlock,
-                             RecipeBuilder);
-  }
   DenseMap<VPValue *, VPValue *> IVEndValues;
   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
   SetVector<VPIRInstruction *> ExitUsersToFix =
@@ -9604,6 +9616,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop);
+  SmallPtrSet<VPBlockBase *, 2> ExitBlocks(Plan->getExitBlocks().begin(),
+                                           Plan->getExitBlocks().end());
+  for (VPBlockBase *VPBB : to_vector(
+           vp_depth_first_shallow(Plan->getVectorLoopRegion()->getEntry()))) {
+    for (VPBlockBase *EB : ExitBlocks) {
+      if (is_contained(VPBB->getSuccessors(), EB)) {
+        cast<VPBasicBlock>(VPBB)->getTerminator()->eraseFromParent();
+        VPBlockUtils::disconnectBlocks(VPBB, EB);
+      }
+    }
+  }
 
   for (ElementCount VF : Range)
     Plan->addVF(VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8b53c559f6533..b128565b30ad2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -855,7 +855,7 @@ VPlan::VPlan(Loop *L) {
   ScalarHeader = createVPIRBasicBlock(L->getHeader());
 
   SmallVector<BasicBlock *> IRExitBlocks;
-  L->getExitBlocks(IRExitBlocks);
+  L->getUniqueExitBlocks(IRExitBlocks);
   for (BasicBlock *EB : IRExitBlocks)
     ExitBlocks.push_back(createVPIRBasicBlock(EB));
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index cddd723e0c1ff..e36e4f9268d09 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -123,6 +123,9 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
     return VPBB;
   }
 
+  if (!TheLoop->contains(BB))
+    return Plan.getExitBlock(BB);
+
   // Create new VPBB.
   StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
   LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
@@ -156,14 +159,6 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
     // Instruction definition is in outermost loop PH.
     return false;
 
-  // Check whether Instruction definition is in a loop exit.
-  SmallVector<BasicBlock *> ExitBlocks;
-  TheLoop->getExitBlocks(ExitBlocks);
-  if (is_contained(ExitBlocks, InstParent)) {
-    // Instruction definition is in outermost loop exit.
-    return false;
-  }
-
   // Check whether Instruction definition is in loop body.
   return !TheLoop->contains(Inst);
 }
@@ -212,11 +207,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
            "Instruction shouldn't have been visited.");
 
     if (auto *Br = dyn_cast<BranchInst>(Inst)) {
-      if (TheLoop->getLoopLatch() == BB ||
-          any_of(successors(BB),
-                 [this](BasicBlock *Succ) { return !TheLoop->contains(Succ); }))
+      if (TheLoop->getLoopLatch() == BB)
         continue;
-
       // Conditional branch instruction are represented using BranchOnCond
       // recipes.
       if (Br->isConditional()) {
@@ -319,6 +311,13 @@ void PlainCFGBuilder::buildPlainCFG(
     if (BB == TheLoop->getLoopLatch()) {
       VPBasicBlock *HeaderVPBB = getOrCreateVPBB(LoopForBB->getHeader());
       VPBlockUtils::connectBlocks(VPBB, HeaderVPBB);
+      assert(isa<BranchInst>(BB->getTerminator()) && "latch must be terminated by branch"
+                            );
+      for (BasicBlock *IRSucc : successors(BB)) {
+        VPBasicBlock *VPSucc = getOrCreateVPBB(IRSucc);
+        if (VPSucc != HeaderVPBB)
+          VPBB->getSuccessors().push_back(VPSucc);
+      }
       continue;
     }
 
@@ -349,24 +348,12 @@ void PlainCFGBuilder::buildPlainCFG(
     BasicBlock *IRSucc1 = BI->getSuccessor(1);
     VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
     VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1);
-
-    // Don't connect any blocks outside the current loop except the latch, which
-    // is handled below.
-    if (LoopForBB &&
-        (LoopForBB == TheLoop || BB != LoopForBB->getLoopLatch())) {
-      if (!LoopForBB->contains(IRSucc0)) {
-        VPBB->setOneSuccessor(Successor1);
-        continue;
-      }
-      if (!LoopForBB->contains(IRSucc1)) {
-        VPBB->setOneSuccessor(Successor0);
-        continue;
-      }
-    }
-
     VPBB->setTwoSuccessors(Successor0, Successor1);
   }
 
+  for (auto *EB : Plan.getExitBlocks()) {
+    setVPBBPredsFromBB(EB, EB->getIRBasicBlock());
+  }
   // 2. The whole CFG has been built at this point so all the input Values must
   // have a VPlan counterpart. Fix VPlan header phi by adding their
   // corresponding VPlan operands.
@@ -448,6 +435,11 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
   VPBasicBlock *OriginalLatch =
       cast<VPBasicBlock>(HeaderVPBB->getSinglePredecessor());
   VPBlockUtils::disconnectBlocks(OriginalLatch, HeaderVPBB);
+  if (auto *RemainingSucc = OriginalLatch->getSingleSuccessor())
+    VPBlockUtils::disconnectBlocks(OriginalLatch,
+                                   RemainingSucc);
+  else
+    assert(OriginalLatch->getSuccessors().empty() && "Unsupported number of successors");
   VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph");
   VPBlockUtils::connectBlocks(Plan.getEntry(), VecPreheader);
   assert(OriginalLatch->getNumSuccessors() == 0 &&
@@ -473,8 +465,12 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
       HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
   // All VPBB's reachable shallowly from HeaderVPBB belong to top level loop,
   // because VPlan is expected to end at top level latch.
-  for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPBB))
-    VPBB->setParent(TopRegion);
+  SmallPtrSet<VPBlockBase *, 2> ExitBlocks(Plan.getExitBlocks().begin(),
+                                           Plan.getExitBlocks().end());
+  for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPBB)) {
+    if (!ExitBlocks.contains(VPBB))
+      VPBB->setParent(TopRegion);
+  }
 
   VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
   VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block");
@@ -503,7 +499,7 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
   BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
   auto *VPExitBlock = Plan.getExitBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
-  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+  VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
   VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
@@ -522,5 +518,8 @@ void VPlanTransforms::introduceTopLevelVectorLoopRegion(
   Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
                        ScalarLatchTerm->getDebugLoc());
 
-  introduceInnerLoopRegions(Plan);
+  if (all_of(Plan.getExitBlocks(), [MiddleVPBB](VPBlockBase *EB) {
+        return EB->getSinglePredecessor() == MiddleVPBB;
+      }))
+    introduceInnerLoopRegions(Plan);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 23f8afc1df806..2b90adc82182c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2275,7 +2275,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
 
 void VPlanTransforms::handleUncountableEarlyExit(
     VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
-    BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
+    BasicBlock *UncountableExitingBlock) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   auto *LatchVPBB = cast<VPBasicBlock>(LoopRegion->getExiting());
   VPBuilder Builder(LatchVPBB->getTerminator());
@@ -2286,17 +2286,29 @@ void VPlanTransforms::handleUncountableEarlyExit(
   // tracks if the uncountable early exit has been taken. Also split the middle
   // block and have it conditionally branch to the early exit block if
   // EarlyExitTaken.
-  auto *EarlyExitingBranch =
-      cast<BranchInst>(UncountableExitingBlock->getTerminator());
-  BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
-  BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
-  BasicBlock *EarlyExitIRBB =
-      !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
-  VPIRBasicBlock *VPEarlyExitBlock = Plan.getExitBlock(EarlyExitIRBB);
-
-  VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
-      OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-  auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
+  VPBasicBlock *EEB = nullptr;
+  for (auto *EB : Plan.getExitBlocks()) {
+    for (VPBlockBase *Pred : EB->getPredecessors()) {
+      if (Pred != MiddleVPBB) {
+        EEB = cast<VPBasicBlock>(Pred);
+        break;
+      }
+    }
+  }
+
+  VPBlockBase *TrueSucc = EEB->getSuccessors()[0];
+  VPBlockBase *FalseSucc = EEB->getSuccessors()[1];
+  auto *VPEarlyExitBlock =
+      cast<VPIRBasicBlock>(TrueSucc->getParent() ? FalseSucc : TrueSucc);
+
+  VPValue *EarlyExitCond = EEB->getTerminator()->getOperand(0);
+  auto *EarlyExitTakenCond = TrueSucc == VPEarlyExitBlock
+                                 ? EarlyExitCond
+                                 : Builder.createNot(EarlyExitCond);
+
+  EEB->getTerminator()->eraseFromParent();
+  VPBlockUtils::disconnectBlocks(EEB, VPEarlyExitBlock);
+
   IsEarlyExitTaken =
       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 15532ee44bc43..e2e20b4fda18e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -177,8 +177,7 @@ struct VPlanTransforms {
   ///    if taken.
   static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
                                          Loop *OrigLoop,
-                                         BasicBlock *UncountableExitingBlock,
-                                         VPRecipeBuilder &RecipeBuilder);
+                                         BasicBlock *UncountableExitingBlock);
 
   /// Lower abstract recipes to concrete ones, that can be codegen'd.
   static void convertToConcreteRecipes(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
index b4b6d3d760349..2ee20180e8246 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll
@@ -31,7 +31,10 @@ define void @foo(i64 %n) {
 ; CHECK-NEXT: outer.latch:
 ; CHECK-NEXT:   EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1>
 ; CHECK-NEXT:   EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8>
-; CHECK-NEXT: Successor(s): vector.body
+; CHECK-NEXT: Successor(s): vector.body,  ir-bb<exit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 entry:
   br label %outer.header

>From 915b55b70643e953d9f61aab3ce988a8e634ec99 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 22 Feb 2025 19:15:32 +0000
Subject: [PATCH 5/5] [VPlan] Move predication to VPlanTransform (NFC) (WIP).

This patch moves the logic to predicate and linearize a VPlan to a
dedicated VPlan transform.

The main logic to perform predication is ready for review, although
there are a few things to note that should be improved, either directly
in the PR or in the future:
 * Edge and block masks are cached in VPRecipeBuilder, so they can be
   accessed during recipe construction. A better alternative may be to
   add mask operands to all VPInstructions that need them and use those
   during recipe construction.
 * The mask caching in a map also means that this map needs updating
   each time a new recipe replaces a VPInstruction; this would also be
   handled by adding mask operands.

Currently this is still WIP because early-exit loop handling does not
work yet, as the exit conditions are not available in the initial
VPlans. This will be fixed with
https://github.com/llvm/llvm-project/pull/128419 and follow-ups.

All tests except those for early-exit loops are passing.
---
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 332 ++++--------------
 .../Transforms/Vectorize/VPRecipeBuilder.h    |  45 +--
 .../Vectorize/VPlanConstruction.cpp           |  29 +-
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 257 ++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   7 +-
 .../Transforms/Vectorize/VPlanTestBase.h      |   3 +-
 7 files changed, 356 insertions(+), 318 deletions(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 0dc6a7d2f594f..e6c7142edd100 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_component_library(LLVMVectorize
   VPlan.cpp
   VPlanAnalysis.cpp
   VPlanConstruction.cpp
+  VPlanPredicator.cpp
   VPlanRecipes.cpp
   VPlanSLP.cpp
   VPlanTransforms.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 710fa31ac4f19..bdc83cb3f4db6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8110,185 +8110,6 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
   });
 }
 
-void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
-  BasicBlock *Src = SI->getParent();
-  assert(!OrigLoop->isLoopExiting(Src) &&
-         all_of(successors(Src),
-                [this](BasicBlock *Succ) {
-                  return OrigLoop->getHeader() != Succ;
-                }) &&
-         "unsupported switch either exiting loop or continuing to header");
-  // Create masks where the terminator in Src is a switch. We create mask for
-  // all edges at the same time. This is more efficient, as we can create and
-  // collect compares for all cases once.
-  VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
-  BasicBlock *DefaultDst = SI->getDefaultDest();
-  MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
-  for (auto &C : SI->cases()) {
-    BasicBlock *Dst = C.getCaseSuccessor();
-    assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
-    // Cases whose destination is the same as default are redundant and can be
-    // ignored - they will get there anyhow.
-    if (Dst == DefaultDst)
-      continue;
-    auto &Compares = Dst2Compares[Dst];
-    VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
-    Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
-  }
-
-  // We need to handle 2 separate cases below for all entries in Dst2Compares,
-  // which excludes destinations matching the default destination.
-  VPValue *SrcMask = getBlockInMask(Src);
-  VPValue *DefaultMask = nullptr;
-  for (const auto &[Dst, Conds] : Dst2Compares) {
-    // 1. Dst is not the default destination. Dst is reached if any of the cases
-    // with destination == Dst are taken. Join the conditions for each case
-    // whose destination == Dst using an OR.
-    VPValue *Mask = Conds[0];
-    for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
-      Mask = Builder.createOr(Mask, V);
-    if (SrcMask)
-      Mask = Builder.createLogicalAnd(SrcMask, Mask);
-    EdgeMaskCache[{Src, Dst}] = Mask;
-
-    // 2. Create the mask for the default destination, which is reached if none
-    // of the cases with destination != default destination are taken. Join the
-    // conditions for each case where the destination is != Dst using an OR and
-    // negate it.
-    DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
-  }
-
-  if (DefaultMask) {
-    DefaultMask = Builder.createNot(DefaultMask);
-    if (SrcMask)
-      DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
-  }
-  EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
-}
-
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
-  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
-  // Look for cached value.
-  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
-  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
-  if (ECEntryIt != EdgeMaskCache.end())
-    return ECEntryIt->second;
-
-  if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
-    createSwitchEdgeMasks(SI);
-    assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
-    return EdgeMaskCache[Edge];
-  }
-
-  VPValue *SrcMask = getBlockInMask(Src);
-
-  // The terminator has to be a branch inst!
-  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
-  assert(BI && "Unexpected terminator found");
-  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
-    return EdgeMaskCache[Edge] = SrcMask;
-
-  // If source is an exiting block, we know the exit edge is dynamically dead
-  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
-  // adding uses of an otherwise potentially dead instruction unless we are
-  // vectorizing a loop with uncountable exits. In that case, we always
-  // materialize the mask.
-  if (OrigLoop->isLoopExiting(Src) &&
-      Src != Legal->getUncountableEarlyExitingBlock())
-    return EdgeMaskCache[Edge] = SrcMask;
-
-  VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
-  assert(EdgeMask && "No Edge Mask found for condition");
-
-  if (BI->getSuccessor(0) != Dst)
-    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
-
-  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
-    // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
-    // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
-    // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
-    EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
-  }
-
-  return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
-  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
-  // Look for cached value.
-  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
-  EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
-  assert(ECEntryIt != EdgeMaskCache.end() &&
-         "looking up mask for edge which has not been created");
-  return ECEntryIt->second;
-}
-
-void VPRecipeBuilder::createHeaderMask() {
-  BasicBlock *Header = OrigLoop->getHeader();
-
-  // When not folding the tail, use nullptr to model all-true mask.
-  if (!CM.foldTailByMasking()) {
-    BlockMaskCache[Header] = nullptr;
-    return;
-  }
-
-  // Introduce the early-exit compare IV <= BTC to form header block mask.
-  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
-  // constructing the desired canonical IV in the header block as its first
-  // non-phi instructions.
-
-  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
-  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
-  HeaderVPBB->insert(IV, NewInsertionPoint);
-
-  VPBuilder::InsertPointGuard Guard(Builder);
-  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
-  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
-  BlockMaskCache[Header] = BlockMask;
-}
-
-VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
-  // Return the cached value.
-  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
-  assert(BCEntryIt != BlockMaskCache.end() &&
-         "Trying to access mask for block without one.");
-  return BCEntryIt->second;
-}
-
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
-  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
-  assert(OrigLoop->getHeader() != BB &&
-         "Loop header must have cached block mask");
-
-  // All-one mask is modelled as no-mask following the convention for masked
-  // load/store/gather/scatter. Initialize BlockMask to no-mask.
-  VPValue *BlockMask = nullptr;
-  // This is the block mask. We OR all unique incoming edges.
-  for (auto *Predecessor :
-       SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) {
-    VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
-    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
-      BlockMaskCache[BB] = EdgeMask;
-      return;
-    }
-
-    if (!BlockMask) { // BlockMask has its initialized nullptr value.
-      BlockMask = EdgeMask;
-      continue;
-    }
-
-    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
-  }
-
-  BlockMaskCache[BB] = BlockMask;
-}
-
 VPWidenMemoryRecipe *
 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                   VFRange &Range) {
@@ -8313,7 +8134,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
 
   VPValue *Mask = nullptr;
   if (Legal->isMaskRequired(I))
-    Mask = getBlockInMask(I->getParent());
+    Mask = getBlockInMask(Builder.getInsertBlock());
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -8432,38 +8253,6 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
   return nullptr;
 }
 
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
-                                           ArrayRef<VPValue *> Operands) {
-  unsigned NumIncoming = Phi->getNumIncomingValues();
-
-  // We know that all PHIs in non-header blocks are converted into selects, so
-  // we don't have to worry about the insertion order and we can just use the
-  // builder. At this point we generate the predication tree. There may be
-  // duplications since this is a simple recursive scan, but future
-  // optimizations will clean it up.
-
-  // Map incoming IR BasicBlocks to incoming VPValues, for lookup below.
-  // TODO: Add operands and masks in order from the VPlan predecessors.
-  DenseMap<BasicBlock *, VPValue *> VPIncomingValues;
-  for (const auto &[Idx, Pred] : enumerate(predecessors(Phi->getParent())))
-    VPIncomingValues[Pred] = Operands[Idx];
-
-  SmallVector<VPValue *, 2> OperandsWithMask;
-  for (unsigned In = 0; In < NumIncoming; In++) {
-    BasicBlock *Pred = Phi->getIncomingBlock(In);
-    OperandsWithMask.push_back(VPIncomingValues.lookup(Pred));
-    VPValue *EdgeMask = getEdgeMask(Pred, Phi->getParent());
-    if (!EdgeMask) {
-      assert(In == 0 && "Both null and non-null edge masks found");
-      assert(all_equal(Operands) &&
-             "Distinct incoming values with one having a full mask");
-      break;
-    }
-    OperandsWithMask.push_back(EdgeMask);
-  }
-  return new VPBlendRecipe(Phi, OperandsWithMask);
-}
-
 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
                                                    VFRange &Range) {
@@ -8539,7 +8328,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       //      all-true mask.
       VPValue *Mask = nullptr;
       if (Legal->isMaskRequired(CI))
-        Mask = getBlockInMask(CI->getParent());
+        Mask = getBlockInMask(Builder.getInsertBlock());
       else
         Mask = Plan.getOrAddLiveIn(
             ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
@@ -8581,7 +8370,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
     // div/rem operation itself.  Otherwise fall through to general handling below.
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands);
-      VPValue *Mask = getBlockInMask(I->getParent());
+      VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
       VPValue *One =
           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
@@ -8663,7 +8452,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
   // In case of predicated execution (due to tail-folding, or conditional
   // execution, or both), pass the relevant mask.
   if (Legal->isMaskRequired(HI->Store))
-    HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
+    HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
 
   return new VPHistogramRecipe(Opcode,
                                make_range(HGramOps.begin(), HGramOps.end()),
@@ -8719,7 +8508,7 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
     // added initially. Masked replicate recipes will later be placed under an
     // if-then construct to prevent side-effects. Generate recipes to compute
     // the block mask for this region.
-    BlockInMask = getBlockInMask(I->getParent());
+    BlockInMask = getBlockInMask(Builder.getInsertBlock());
   }
 
   // Note that there is some custom logic to mark some intrinsics as uniform
@@ -8852,9 +8641,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
   // nodes, calls and memory operations.
   VPRecipeBase *Recipe;
   if (auto *Phi = dyn_cast<PHINode>(Instr)) {
-    if (Phi->getParent() != OrigLoop->getHeader())
-      return tryToBlend(Phi, Operands);
-
+    assert(Phi->getParent() == OrigLoop->getHeader() &&
+           "Non-header phis should have been handled during predication");
     assert(Operands.size() == 2 && "Must have 2 operands for header phis");
     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
       return Recipe;
@@ -8959,7 +8747,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
             ReductionOpcode == Instruction::Sub) &&
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
-    VPValue *Mask = getBlockInMask(Reduction->getParent());
+    VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
     VPValue *Zero =
         Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
     BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
@@ -9301,8 +9089,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
             return !CM.requiresScalarEpilogue(VF.isVector());
           },
           Range);
-  DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
-  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
       CM.foldTailByMasking(), OrigLoop);
@@ -9342,9 +9129,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     }
   }
 
-  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
-                                Builder);
-
   // ---------------------------------------------------------------------------
   // Pre-construction: record ingredients whose recipes we'll need to further
   // process after constructing the initial VPlan.
@@ -9385,39 +9169,55 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
       });
 
-  RecipeBuilder.collectScaledReductions(Range);
 
   auto *MiddleVPBB = Plan->getMiddleBlock();
 
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+                                Builder);
+  if (NeedsMasks) {
+    VPlanTransforms::predicateAndLinearize(*Plan, CM.foldTailByMasking(),
+                                           RecipeBuilder);
+  }
+
+  {
+    VPBlockBase *PrevVPBB = nullptr;
+    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(Header);
+
+    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+      // Handle VPBBs down to the latch.
+      if (VPBB == LoopRegion->getExiting()) {
+        VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+        break;
+      }
+
+      auto Successors = to_vector(VPBB->getSuccessors());
+      if (Successors.size() > 1)
+        VPBB->getTerminator()->eraseFromParent();
+
+      // Flatten the CFG in the loop. Masks for blocks have already been
+      // generated and added to recipes as needed. To do so, first disconnect
+      // VPBB from its successors. Then connect VPBB to the previously visited
+      // VPBB.
+      for (auto *Succ : Successors)
+        VPBlockUtils::disconnectBlocks(VPBB, Succ);
+      if (PrevVPBB)
+        VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+
+      PrevVPBB = VPBB;
+    }
+  }
+
+  RecipeBuilder.collectScaledReductions(Range);
+
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       HeaderVPBB);
 
   VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-  VPBlockBase *PrevVPBB = nullptr;
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
-    // Handle VPBBs down to the latch.
-    if (VPBB == LoopRegion->getExiting()) {
-      assert(!VPB2IRBB.contains(VPBB) &&
-             "the latch block shouldn't have a corresponding IRBB");
-      VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
-      break;
-    }
-
-    // Create mask based on the IR BB corresponding to VPBB.
-    // TODO: Predicate directly based on VPlan.
-    Builder.setInsertPoint(VPBB, VPBB->begin());
-    if (VPBB == HeaderVPBB) {
-      Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
-      RecipeBuilder.createHeaderMask();
-    } else if (NeedsMasks) {
-      // FIXME: At the moment, masks need to be placed at the beginning of the
-      // block, as blends introduced for phi nodes need to use it. The created
-      // blends should be sunk after the mask recipes.
-      RecipeBuilder.createBlockInMask(VPB2IRBB.lookup(VPBB));
-    }
-
     // Convert input VPInstructions to widened recipes.
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       auto *SingleDef = cast<VPSingleDefRecipe>(&R);
@@ -9427,7 +9227,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       // latter are added above for masking.
       // FIXME: Migrate code relying on the underlying instruction from VPlan0
       // to construct recipes below to not use the underlying instruction.
-      if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
+      if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
+              &R) ||
           (isa<VPInstruction>(&R) && !UnderlyingValue))
         continue;
 
@@ -9436,14 +9237,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
              UnderlyingValue && "unsupported recipe");
 
-      if (isa<VPInstruction>(&R) &&
-          (cast<VPInstruction>(&R)->getOpcode() ==
-               VPInstruction::BranchOnCond ||
-           (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
-        R.eraseFromParent();
-        break;
-      }
-
       // TODO: Gradually replace uses of underlying instruction by analyses on
       // VPlan.
       Instruction *Instr = cast<Instruction>(UnderlyingValue);
@@ -9479,22 +9272,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       } else {
         Builder.insert(Recipe);
       }
-      if (Recipe->getNumDefinedValues() == 1)
+      if (Recipe->getNumDefinedValues() == 1) {
         SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
-      else
+        for (auto &[_, V] : RecipeBuilder.BlockMaskCache) {
+          if (V == SingleDef)
+            V = Recipe->getVPSingleValue();
+        }
+      } else
         assert(Recipe->getNumDefinedValues() == 0 &&
                "Unexpected multidef recipe");
       R.eraseFromParent();
     }
-
-    // Flatten the CFG in the loop. Masks for blocks have already been generated
-    // and added to recipes as needed. To do so, first disconnect VPBB from its
-    // successors. Then connect VPBB to the previously visited VPBB.
-    for (auto *Succ : to_vector(VPBB->getSuccessors()))
-      VPBlockUtils::disconnectBlocks(VPBB, Succ);
-    if (PrevVPBB)
-      VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
-    PrevVPBB = VPBB;
   }
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
@@ -9612,8 +9400,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   // the vectorization pipeline.
   assert(!OrigLoop->isInnermost());
   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-  DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
-  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+  auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
   VPlanTransforms::introduceTopLevelVectorLoopRegion(
       *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop);
   SmallPtrSet<VPBlockBase *, 2> ExitBlocks(Plan->getExitBlocks().begin(),
@@ -9796,7 +9583,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       BasicBlock *BB = CurrentLinkI->getParent();
       VPValue *CondOp = nullptr;
       if (CM.blockNeedsPredicationForAnyReason(BB))
-        CondOp = RecipeBuilder.getBlockInMask(BB);
+        CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
 
       // Non-FP RdxDescs will have all fast math flags set, so clear them.
       FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
@@ -9835,7 +9622,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // different numbers of lanes. Partial reductions mask the input instead.
     if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
         !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
-      VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
+      VPValue *Cond =
+          RecipeBuilder.getBlockInMask(VectorLoopRegion->getEntryBasicBlock());
       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
              "reduction recipe must be defined before latch");
       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 334cfbad8bd7c..9900c4117c5f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -73,11 +73,14 @@ class VPRecipeBuilder {
   /// if-conversion currently takes place during VPlan-construction, so these
   /// caches are only used at that stage.
   using EdgeMaskCacheTy =
-      DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
-  using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+      DenseMap<std::pair<VPBasicBlock *, VPBasicBlock *>, VPValue *>;
+  using BlockMaskCacheTy = DenseMap<VPBasicBlock *, VPValue *>;
   EdgeMaskCacheTy EdgeMaskCache;
+
+public:
   BlockMaskCacheTy BlockMaskCache;
 
+private:
   // VPlan construction support: Hold a mapping from ingredients to
   // their recipe.
   DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
@@ -114,11 +117,6 @@ class VPRecipeBuilder {
   tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range);
 
-  /// Handle non-loop phi nodes. Return a new VPBlendRecipe otherwise. Currently
-  /// all such phi nodes are turned into a sequence of select instructions as
-  /// the vectorizer currently performs full if-conversion.
-  VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands);
-
   /// Handle call instructions. If \p CI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
   /// decreased to ensure same decision from \p Range.Start to \p Range.End.
@@ -187,27 +185,20 @@ class VPRecipeBuilder {
     Ingredient2Recipe[I] = R;
   }
 
-  /// Create the mask for the vector loop header block.
-  void createHeaderMask();
-
-  /// A helper function that computes the predicate of the block BB, assuming
-  /// that the header block of the loop is set to True or the loop mask when
-  /// tail folding.
-  void createBlockInMask(BasicBlock *BB);
-
+  void setBlockInMask(VPBasicBlock *BB, VPValue *Mask) {
+    assert(!BlockMaskCache.contains(BB) && "Mask already set");
+    BlockMaskCache[BB] = Mask;
+  }
   /// Returns the *entry* mask for the block \p BB.
-  VPValue *getBlockInMask(BasicBlock *BB) const;
-
-  /// Create an edge mask for every destination of cases and/or default.
-  void createSwitchEdgeMasks(SwitchInst *SI);
-
-  /// A helper function that computes the predicate of the edge between SRC
-  /// and DST.
-  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
-  /// A helper that returns the previously computed predicate of the edge
-  /// between SRC and DST.
-  VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
+  VPValue *getBlockInMask(VPBasicBlock *BB) const {
+    return BlockMaskCache.lookup(BB);
+  }
+  void setEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst, VPValue *Mask) {
+    EdgeMaskCache[{Src, Dst}] = Mask;
+  }
+  VPValue *getEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) const {
+    return EdgeMaskCache.lookup({Src, Dst});
+  }
 
   /// Return the recipe created for given ingredient.
   VPRecipeBase *getRecipe(Instruction *I) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index e36e4f9268d09..de9ef756b77b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -65,7 +65,7 @@ class PlainCFGBuilder {
       : TheLoop(Lp), LI(LI), Plan(P) {}
 
   /// Build plain CFG for TheLoop  and connects it to Plan's entry.
-  void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
+  void buildPlainCFG();
 };
 } // anonymous namespace
 
@@ -258,10 +258,16 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       for (Value *Op : Inst->operands())
         VPOperands.push_back(getOrCreateVPOperand(Op));
 
-      // Build VPInstruction for any arbitrary Instruction without specific
-      // representation in VPlan.
-      NewR = cast<VPInstruction>(
-          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+      if (auto *ICmp = dyn_cast<ICmpInst>(Inst)) {
+        NewR = cast<VPInstruction>(VPIRBuilder.createICmp(
+            ICmp->getPredicate(), VPOperands[0], VPOperands[1]));
+        NewR->setUnderlyingValue(ICmp);
+      } else {
+        // Build VPInstruction for any arbitrary Instruction without specific
+        // representation in VPlan.
+        NewR = cast<VPInstruction>(
+            VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+      }
     }
 
     IRDef2VPValue[Inst] = NewR;
@@ -269,8 +275,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
 }
 
 // Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG(
-    DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+void PlainCFGBuilder::buildPlainCFG() {
 
   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
@@ -380,18 +385,14 @@ void PlainCFGBuilder::buildPlainCFG(
     }
   }
 
-  for (const auto &[IRBB, VPB] : BB2VPBB)
-    VPB2IRBB[VPB] = IRBB;
-
   LLVM_DEBUG(Plan.setName("Plain CFG\n"); dbgs() << Plan);
 }
 
-std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(
-    Loop *TheLoop, LoopInfo &LI,
-    DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
+                                                      LoopInfo &LI) {
   auto Plan = std::make_unique<VPlan>(TheLoop);
   PlainCFGBuilder Builder(TheLoop, LI, *Plan);
-  Builder.buildPlainCFG(VPB2IRBB);
+  Builder.buildPlainCFG();
   return Plan;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
new file mode 100644
index 0000000000000..3dee5b3e42e6d
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -0,0 +1,257 @@
+//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements predication for VPlans.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanCFG.h"
+#include "VPlanTransforms.h"
+#include "VPlanUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+struct VPPredicator {
+  /// When we if-convert we need to create edge masks. We have to cache values
+  /// so that we don't end up with exponential recursion/IR. Note that
+  /// if-conversion currently takes place during VPlan-construction, so these
+  /// caches are only used at that stage.
+  using EdgeMaskCacheTy =
+      DenseMap<std::pair<VPBasicBlock *, VPBasicBlock *>, VPValue *>;
+  using BlockMaskCacheTy = DenseMap<VPBasicBlock *, VPValue *>;
+
+  VPPredicator(VPRecipeBuilder &RecipeBuilder) : RecipeBuilder(RecipeBuilder) {}
+
+  VPRecipeBuilder &RecipeBuilder;
+
+  VPBuilder Builder;
+  VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
+    assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
+
+    // Look for cached value.
+    VPValue *EdgeMask = RecipeBuilder.getEdgeMask(Src, Dst);
+    if (EdgeMask)
+      return EdgeMask;
+
+    VPValue *SrcMask = RecipeBuilder.getBlockInMask(Src);
+
+    // An empty block or one with a single successor passes its mask through.
+    if (Src->empty() || Src->getNumSuccessors() == 1) {
+      RecipeBuilder.setEdgeMask(Src, Dst, SrcMask);
+      return SrcMask;
+    }
+
+    auto *Term = cast<VPInstruction>(Src->getTerminator());
+    if (Term->getOpcode() == Instruction::Switch) {
+      createSwitchEdgeMasks(Term);
+      return RecipeBuilder.getEdgeMask(Src, Dst);
+    }
+
+    auto *BI = cast<VPInstruction>(Src->getTerminator());
+    assert(BI->getOpcode() == VPInstruction::BranchOnCond);
+    if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) {
+      RecipeBuilder.setEdgeMask(Src, Dst, SrcMask);
+      return SrcMask;
+    }
+
+    EdgeMask = BI->getOperand(0);
+    assert(EdgeMask && "No Edge Mask found for condition");
+
+    if (Src->getSuccessors()[0] != Dst)
+      EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
+
+    if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+      // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+      // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+      // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+      EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
+    }
+
+    RecipeBuilder.setEdgeMask(Src, Dst, EdgeMask);
+    return EdgeMask;
+  }
+
+  VPValue *createBlockInMask(VPBasicBlock *VPBB) {
+    Builder.setInsertPoint(VPBB, VPBB->begin());
+    // All-one mask is modelled as no-mask following the convention for masked
+    // load/store/gather/scatter. Initialize BlockMask to no-mask.
+    VPValue *BlockMask = nullptr;
+    // This is the block mask. We OR all unique incoming edges.
+    for (auto *Predecessor : SetVector<VPBlockBase *>(
+             VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
+      VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+      if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
+                       // too.
+        RecipeBuilder.setBlockInMask(VPBB, EdgeMask);
+        return EdgeMask;
+      }
+
+      if (!BlockMask) { // BlockMask has its initialized nullptr value.
+        BlockMask = EdgeMask;
+        continue;
+      }
+
+      BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
+    }
+
+    RecipeBuilder.setBlockInMask(VPBB, BlockMask);
+    return BlockMask;
+  }
+
+  void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
+    if (!FoldTail) {
+      RecipeBuilder.setBlockInMask(HeaderVPBB, nullptr);
+      return;
+    }
+
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+    // constructing the desired canonical IV in the header block as its first
+    // non-phi instructions.
+
+    auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+    auto &Plan = *HeaderVPBB->getPlan();
+    auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+    HeaderVPBB->insert(IV, NewInsertionPoint);
+
+    VPBuilder::InsertPointGuard Guard(Builder);
+    Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+    VPValue *BlockMask = nullptr;
+    VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+    RecipeBuilder.setBlockInMask(HeaderVPBB, BlockMask);
+  }
+
+  void createSwitchEdgeMasks(VPInstruction *SI) {
+    VPBasicBlock *Src = SI->getParent();
+
+    // Create masks where the terminator in Src is a switch. We create masks for
+    // all edges at the same time. This is more efficient, as we can create and
+    // collect compares for all cases once.
+    VPValue *Cond = SI->getOperand(0);
+    VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
+    MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
+    for (const auto &[Idx, Succ] :
+         enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
+      VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
+      // assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already
+      // created");
+      //  Cases whose destination is the same as default are redundant and can
+      //  be ignored - they will get there anyhow.
+      if (Dst == DefaultDst)
+        continue;
+      auto &Compares = Dst2Compares[Dst];
+      VPValue *V = SI->getOperand(Idx + 1);
+      Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
+    }
+
+    // We need to handle 2 separate cases below for all entries in Dst2Compares,
+    // which excludes destinations matching the default destination.
+    VPValue *SrcMask = RecipeBuilder.getBlockInMask(Src);
+    VPValue *DefaultMask = nullptr;
+    for (const auto &[Dst, Conds] : Dst2Compares) {
+      // 1. Dst is not the default destination. Dst is reached if any of the
+      // cases with destination == Dst are taken. Join the conditions for each
+      // case whose destination == Dst using an OR.
+      VPValue *Mask = Conds[0];
+      for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+        Mask = Builder.createOr(Mask, V);
+      if (SrcMask)
+        Mask = Builder.createLogicalAnd(SrcMask, Mask);
+      RecipeBuilder.setEdgeMask(Src, Dst, Mask);
+
+      // 2. Create the mask for the default destination, which is reached if
+      // none of the cases with destination != default destination are taken.
+      // Join the masks of all destinations != default destination using an OR
+      // here; the result is negated after the loop.
+      DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
+    }
+
+    if (DefaultMask) {
+      DefaultMask = Builder.createNot(DefaultMask);
+      if (SrcMask)
+        DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
+    }
+    RecipeBuilder.setEdgeMask(Src, DefaultDst, DefaultMask);
+  }
+};
+
+void VPlanTransforms::predicateAndLinearize(VPlan &Plan, bool FoldTail,
+                                            VPRecipeBuilder &RecipeBuilder) {
+  VPBlockBase *PrevVPBB = nullptr;
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Header);
+  VPPredicator Predicator(RecipeBuilder);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    // Handle VPBBs down to the latch.
+    if (VPBB == LoopRegion->getExiting()) {
+      break;
+    }
+
+    if (VPBB == Header) {
+      Predicator.createHeaderMask(Header, FoldTail);
+      continue;
+    }
+    SmallVector<VPWidenPHIRecipe *> Phis;
+    for (VPRecipeBase &R : VPBB->phis())
+      Phis.push_back(cast<VPWidenPHIRecipe>(&R));
+
+    Predicator.createBlockInMask(VPBB);
+
+    for (VPWidenPHIRecipe *Phi : Phis) {
+      PHINode *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+
+      unsigned NumIncoming = IRPhi->getNumIncomingValues();
+
+      // We know that all PHIs in non-header blocks are converted into selects,
+      // so we don't have to worry about the insertion order and can insert each
+      // blend directly before the phi it replaces. At this point we generate
+      // the predication tree. There may be duplications since this is a simple
+      // recursive scan, but future optimizations will clean it up.
+
+      // Map incoming IR BasicBlocks to incoming VPValues, for lookup below.
+      // TODO: Add operands and masks in order from the VPlan predecessors.
+      DenseMap<BasicBlock *, VPValue *> VPIncomingValues;
+      DenseMap<BasicBlock *, VPBasicBlock *> VPIncomingBlocks;
+      for (const auto &[Idx, Pred] :
+           enumerate(predecessors(IRPhi->getParent()))) {
+        VPIncomingValues[Pred] = Phi->getOperand(Idx);
+        VPIncomingBlocks[Pred] =
+            cast<VPBasicBlock>(VPBB->getPredecessors()[Idx]);
+      }
+
+      SmallVector<VPValue *, 2> OperandsWithMask;
+      for (unsigned In = 0; In < NumIncoming; In++) {
+        BasicBlock *Pred = IRPhi->getIncomingBlock(In);
+        OperandsWithMask.push_back(VPIncomingValues.lookup(Pred));
+        VPValue *EdgeMask =
+            RecipeBuilder.getEdgeMask(VPIncomingBlocks.lookup(Pred), VPBB);
+        if (!EdgeMask) {
+          assert(In == 0 && "Both null and non-null edge masks found");
+          assert(all_equal(Phi->operands()) &&
+                 "Distinct incoming values with one having a full mask");
+          break;
+        }
+        OperandsWithMask.push_back(EdgeMask);
+      }
+      auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask);
+      Blend->insertBefore(Phi);
+      Phi->replaceAllUsesWith(Blend);
+      Phi->eraseFromParent();
+      RecipeBuilder.setRecipe(IRPhi, Blend);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e2e20b4fda18e..5931b3248b915 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -52,9 +52,7 @@ struct VPlanTransforms {
       verifyVPlanIsValid(Plan);
   }
 
-  static std::unique_ptr<VPlan>
-  buildPlainCFG(Loop *TheLoop, LoopInfo &LI,
-                DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
+  static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop, LoopInfo &LI);
 
   /// Introduce the top-level VPRegionBlock for the main loop in \p Plan. Coming
   /// into this function, \p Plan's top-level loop is modeled using a plain CFG.
@@ -204,6 +202,9 @@ struct VPlanTransforms {
   /// candidates.
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                      unsigned VectorRegWidth);
+
+  static void predicateAndLinearize(VPlan &Plan, bool FoldTail,
+                                    VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 5b851499033a2..8b99ffe993871 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -70,8 +70,7 @@ class VPlanTestIRBase : public testing::Test {
 
     Loop *L = LI->getLoopFor(LoopHeader);
     PredicatedScalarEvolution PSE(*SE, *L);
-    DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
-    auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB);
+    auto Plan = VPlanTransforms::buildPlainCFG(L, *LI);
     VPlanTransforms::introduceTopLevelVectorLoopRegion(
         *Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L);
     return Plan;



More information about the llvm-commits mailing list