[libcxx-commits] [lld] [clang-tools-extra] [libcxx] [flang] [clang] [libc] [lldb] [compiler-rt] [llvm] [VPlan] Implement cloning of VPlans. (PR #73158)

Florian Hahn via libcxx-commits libcxx-commits at lists.llvm.org
Sat Jan 27 05:05:20 PST 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/73158

>From 13a26e8e7440c3b501730b22588af393a3e543cd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 6 Jul 2023 08:07:45 +0100
Subject: [PATCH 1/5] [VPlan] Implement cloning of VPlans.

This patch implements cloning for VPlans and recipes. Cloning is used in
the epilogue vectorization path to clone the VPlan for the main vector
loop. As a result, the plan executed for the main vector loop is no
longer re-used when executing the epilogue vector loop, which in turn
will enable us to perform optimizations based on UF & VF.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 124 ++++++++++++
 llvm/lib/Transforms/Vectorize/VPlan.h         | 182 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTest.cpp        |   2 +
 4 files changed, 309 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 10c068e3b5895c2..9ffd44d59ffc6de 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10078,7 +10078,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
-        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+        VPlan &BestMainPlan = *LVP.getBestPlanFor(EPI.MainLoopVF).clone();
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b6e56c47c227f77..99b2a3bd59a64df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -615,6 +615,18 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+VPBlockBase *VPRegionBlock::clone() {
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2New;
+  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
+  VPBlockBase *NewEntry =
+      VPBlockUtils::cloneCFG(Entry, Old2New, Old2NewVPValues);
+  auto *NewR =
+      new VPRegionBlock(NewEntry, Old2New[Exiting], getName(), isReplicator());
+  for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
+    Block->setParent(NewR);
+  return NewR;
+}
+
 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
   for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
     // Drop all references in VPBasicBlocks and replace all uses with
@@ -982,6 +994,65 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
 }
 
+static void remapVPValues(VPBasicBlock *OldBB, VPBasicBlock *NewBB,
+                          DenseMap<VPValue *, VPValue *> &Old2NewVPValues,
+                          bool Full = false) {
+  for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
+    for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
+      VPValue *NewOp = Old2NewVPValues.lookup(OldR.getOperand(I));
+      if (!Full)
+        continue;
+      NewR.setOperand(I, NewOp);
+    }
+    for (const auto &[OldV, NewV] :
+         zip(OldR.definedValues(), NewR.definedValues()))
+      Old2NewVPValues[OldV] = NewV;
+  }
+}
+
+VPlan *VPlan::clone() {
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2New;
+  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
+
+  auto *NewPlan = new VPlan();
+  SmallVector<VPValue *, 16> NewLiveIns;
+  for (VPValue *LI : VPLiveInsToFree) {
+    VPValue *NewLI = new VPValue(LI->getLiveInIRValue());
+    NewPlan->VPLiveInsToFree.push_back(NewLI);
+    Old2NewVPValues[LI] = NewLI;
+  }
+
+  Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
+  Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+  if (BackedgeTakenCount) {
+    Old2NewVPValues[BackedgeTakenCount] = new VPValue();
+    NewPlan->BackedgeTakenCount = Old2NewVPValues[BackedgeTakenCount];
+  }
+
+  auto NewPH = cast<VPBasicBlock>(Preheader->clone());
+  remapVPValues(cast<VPBasicBlock>(Preheader), cast<VPBasicBlock>(NewPH),
+                Old2NewVPValues, /*Full*/ true);
+  VPValue *NewTC = Old2NewVPValues.lookup(TripCount);
+  if (!NewTC)
+    Old2NewVPValues[TripCount] = new VPValue(TripCount->getLiveInIRValue());
+  NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+  auto *NewEntry = cast<VPBasicBlock>(VPBlockUtils::cloneCFG(
+      getEntry(), Old2New, Old2NewVPValues, /*FullRemapping*/ true));
+
+  NewPlan->Entry = NewEntry;
+  NewPlan->Preheader = NewPH;
+  NewEntry->setPlan(NewPlan);
+  NewPH->setPlan(NewPlan);
+  NewPlan->VFs = VFs;
+  NewPlan->UFs = UFs;
+  NewPlan->Name = Name;
+
+  for (const auto &[_, LO] : LiveOuts)
+    NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
+  return NewPlan;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
@@ -1200,6 +1271,59 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
 }
 #endif
 
+VPBlockBase *VPBlockUtils::cloneCFG(
+    VPBlockBase *Entry, DenseMap<VPBlockBase *, VPBlockBase *> &Old2New,
+    DenseMap<VPValue *, VPValue *> &Old2NewVPValues, bool FullRemapping) {
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+  VPBlockBase *NewEntry = nullptr;
+  for (VPBlockBase *BB : RPOT) {
+    VPBlockBase *NewBB = BB->clone();
+    if (!NewEntry)
+      NewEntry = NewBB;
+
+    for (VPBlockBase *Pred : BB->getPredecessors())
+      connectBlocks(Old2New[Pred], NewBB);
+
+    Old2New[BB] = NewBB;
+
+    if (!isa<VPBasicBlock>(BB))
+      continue;
+  }
+
+  // Update the operands of all cloned recipes starting at NewEntry. This
+  // traverses all reachable blocks. This is done in two steps, to handle cycles
+  // in PHI recipes.
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+      OldDeepRPOT(Entry);
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+      NewDeepRPOT(NewEntry);
+  // First, collect all mappings from old to new VPValues defined by cloned
+  // recipes.
+  for (const auto &[OldBB, NewBB] :
+       zip(VPBlockUtils::blocksOnly<VPBasicBlock>(OldDeepRPOT),
+           VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT))) {
+    for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB))
+      for (const auto &[OldV, NewV] :
+           zip(OldR.definedValues(), NewR.definedValues()))
+        Old2NewVPValues[OldV] = NewV;
+  }
+
+  // Update all operands to use cloned VPValues.
+  for (VPBasicBlock *NewBB :
+       VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT)) {
+    for (VPRecipeBase &NewR : *NewBB)
+      for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
+        VPValue *NewOp = Old2NewVPValues.lookup(NewR.getOperand(I));
+        if (!FullRemapping)
+          continue;
+        NewR.setOperand(I, NewOp);
+      }
+  }
+
+  return NewEntry;
+}
+
 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                           Old2NewTy &Old2New,
                                           InterleavedAccessInfo &IAI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d279da758ec00d..2367daf4d31aa9d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -478,6 +478,8 @@ class VPBlockBase {
 
   using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
 
+  virtual VPBlockBase *clone() = 0;
+
   virtual ~VPBlockBase() = default;
 
   const std::string &getName() const { return Name; }
@@ -725,6 +727,9 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
       : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {}
   virtual ~VPRecipeBase() = default;
 
+  /// Clone the current recipe.
+  virtual VPRecipeBase *clone() = 0;
+
   /// \return the VPBasicBlock which this VPRecipe belongs to.
   VPBasicBlock *getParent() { return Parent; }
   const VPBasicBlock *getParent() const { return Parent; }
@@ -880,6 +885,12 @@ class VPRecipeWithIRFlags : public VPRecipeBase {
     unsigned AllFlags;
   };
 
+protected:
+  void transferFlags(VPRecipeWithIRFlags &Other) {
+    OpType = Other.OpType;
+    AllFlags = Other.AllFlags;
+  }
+
 public:
   template <typename IterT>
   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
@@ -1110,6 +1121,13 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
 
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
 
+  VPRecipeBase *clone() override {
+    SmallVector<VPValue *, 2> Operands(operands());
+    auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
+    New->transferFlags(*this);
+    return New;
+  }
+
   unsigned getOpcode() const { return Opcode; }
 
   /// Generate the instruction.
@@ -1204,6 +1222,12 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   ~VPWidenRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
+    R->transferFlags(*this);
+    return R;
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenSC)
 
   /// Produce widened copies of all Ingredients.
@@ -1243,6 +1267,14 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   ~VPWidenCastRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    if (auto *UV = getUnderlyingValue())
+      return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
+                                   *cast<CastInst>(UV));
+
+    return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
 
   /// Produce widened copies of the cast.
@@ -1281,6 +1313,11 @@ class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
 
   ~VPWidenCallRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenCallRecipe(*cast<CallInst>(getUnderlyingInstr()),
+                                 operands(), VectorIntrinsicID, Variant);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
 
   /// Produce a widened version of the call instruction.
@@ -1302,6 +1339,11 @@ struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
 
   ~VPWidenSelectRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()),
+                                   operands());
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
 
   /// Produce a widened version of the select instruction.
@@ -1346,6 +1388,11 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   ~VPWidenGEPRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()),
+                                operands());
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
 
   /// Generate the gep nodes.
@@ -1381,6 +1428,11 @@ class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
     return true;
   }
 
+  VPRecipeBase *clone() override {
+    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse,
+                                     getDebugLoc());
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1491,6 +1543,11 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
 
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(),
+                                             getStepValue(), IndDesc, Trunc);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
 
   /// Generate the vectorized and scalarized versions of the phi node as
@@ -1561,6 +1618,12 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
 
   ~VPWidenPointerInductionRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenPointerInductionRecipe(
+        cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
+        IndDesc, IsScalarAfterVectorization);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
 
   /// Generate vector values for the pointer induction.
@@ -1594,6 +1657,13 @@ class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
       addOperand(Start);
   }
 
+  VPRecipeBase *clone() override {
+    auto *Res = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+                                     getOperand(0));
+    Res->IncomingBlocks = IncomingBlocks;
+    return Res;
+  }
+
   ~VPWidenPHIRecipe() override = default;
 
   VP_CLASSOF_IMPL(VPDef::VPWidenPHISC)
@@ -1633,6 +1703,11 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
     return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC;
   }
 
+  VPRecipeBase *clone() override {
+    return new VPFirstOrderRecurrencePHIRecipe(
+        cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
+  }
+
   void execute(VPTransformState &State) override;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1668,6 +1743,14 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
 
   ~VPReductionPHIRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    auto *R =
+        new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
+                                 *getOperand(0), IsInLoop, IsOrdered);
+    R->addOperand(getBackedgeValue());
+    return R;
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPReductionPHISC)
 
   static inline bool classof(const VPHeaderPHIRecipe *R) {
@@ -1710,6 +1793,11 @@ class VPBlendRecipe : public VPRecipeBase, public VPValue {
            "of operands");
   }
 
+  VPRecipeBase *clone() override {
+    SmallVector<VPValue *> Ops(operands());
+    return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
 
   /// Return the number of incoming values, taking into account that a single
@@ -1779,6 +1867,11 @@ class VPInterleaveRecipe : public VPRecipeBase {
   }
   ~VPInterleaveRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
+                                  NeedsMaskForGaps);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
 
   /// Return the address accessed by this recipe.
@@ -1845,6 +1938,11 @@ class VPReductionRecipe : public VPRecipeBase, public VPValue {
 
   ~VPReductionRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
+                                 getVecOp(), getCondOp());
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPReductionSC)
 
   /// Generate the reduction in the loop
@@ -1889,6 +1987,11 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   ~VPReplicateRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
+                                 isPredicated() ? getMask() : nullptr);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
 
   /// Generate replicas of the desired Ingredient. Replicas will be generated
@@ -1941,6 +2044,10 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
       addOperand(BlockInMask);
   }
 
+  VPRecipeBase *clone() override {
+    return new VPBranchOnMaskRecipe(getOperand(0));
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
 
   /// Generate the extraction of the appropriate bit from the block mask and the
@@ -1988,6 +2095,10 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
       : VPRecipeBase(VPDef::VPPredInstPHISC, PredV), VPValue(this) {}
   ~VPPredInstPHIRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPPredInstPHIRecipe(getOperand(0));
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
 
   /// Generates phi nodes for live-outs as needed to retain SSA form.
@@ -2051,6 +2162,16 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
     setMask(Mask);
   }
 
+  VPRecipeBase *clone() override {
+    if (isStore())
+      return new VPWidenMemoryInstructionRecipe(
+          cast<StoreInst>(Ingredient), getAddr(), getStoredValue(), getMask(),
+          Consecutive, Reverse);
+
+    return new VPWidenMemoryInstructionRecipe(
+        cast<LoadInst>(Ingredient), getAddr(), getMask(), Consecutive, Reverse);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
 
   /// Return the address accessed by this recipe.
@@ -2117,6 +2238,8 @@ class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue {
 
   ~VPExpandSCEVRecipe() override = default;
 
+  VPRecipeBase *clone() override { return new VPExpandSCEVRecipe(Expr, SE); }
+
   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
 
   /// Generate a canonical vector induction variable of the vector loop, with
@@ -2142,6 +2265,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 
   ~VPCanonicalIVPHIRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
+    R->addOperand(getBackedgeValue());
+    return R;
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
 
   static inline bool classof(const VPHeaderPHIRecipe *D) {
@@ -2195,6 +2324,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
 
   static inline bool classof(const VPHeaderPHIRecipe *D) {
@@ -2220,6 +2353,11 @@ class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
 
   ~VPWidenCanonicalIVRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPWidenCanonicalIVRecipe(
+        cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
 
   /// Generate a canonical vector induction variable of the vector loop, with
@@ -2262,9 +2400,22 @@ class VPDerivedIVRecipe : public VPRecipeBase, public VPValue {
         VPValue(this), TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()),
         FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) {
   }
+  VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
+                    const FPMathOperator *FPBinOp, VPValue *Start,
+                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
+                    Type *TruncResultTy)
+      : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
+        VPValue(this), TruncResultTy(TruncResultTy), Kind(Kind),
+        FPBinOp(FPBinOp) {}
 
   ~VPDerivedIVRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPDerivedIVRecipe(Kind, FPBinOp, getOperand(0),
+                                 cast<VPCanonicalIVPHIRecipe>(getOperand(1)),
+                                 getOperand(2), TruncResultTy);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
 
   /// Generate the transformed value of the induction at offset StartValue (1.
@@ -2316,6 +2467,12 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   ~VPScalarIVStepsRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPScalarIVStepsRecipe(
+        getOperand(0), getOperand(1), InductionOpcode,
+        hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
 
   /// Generate the scalarized versions of the phi node as needed by their users.
@@ -2360,6 +2517,13 @@ class VPBasicBlock : public VPBlockBase {
       Recipes.pop_back();
   }
 
+  VPBlockBase *clone() override {
+    auto *NewBlock = new VPBasicBlock(getName());
+    for (VPRecipeBase &R : *this)
+      NewBlock->appendRecipe(R.clone());
+    return NewBlock;
+  }
+
   /// Instruction iterators...
   using iterator = RecipeListTy::iterator;
   using const_iterator = RecipeListTy::const_iterator;
@@ -2498,6 +2662,8 @@ class VPRegionBlock : public VPBlockBase {
     }
   }
 
+  VPBlockBase *clone() override;
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
@@ -2618,6 +2784,9 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
+  /// Construct an uninitialized VPlan, should be used for cloning only.
+  explicit VPlan() = default;
+
 public:
   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
   /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to
@@ -2807,6 +2976,8 @@ class VPlan {
   VPBasicBlock *getPreheader() { return Preheader; }
   const VPBasicBlock *getPreheader() const { return Preheader; }
 
+  VPlan *clone();
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
@@ -2969,6 +3140,17 @@ class VPBlockUtils {
       return cast<BlockTy>(&Block);
     });
   }
+
+  /// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
+  /// the blocks and their recipes. Operands of cloned recipes will be updated
+  /// to use new VPValues from \p Old2NewValues. If \p FullRemapping is set to
+  /// true, then all old VPValues from outside the cloned nodes must be mapped
+  /// in \p Old2NewValues.
+  static VPBlockBase *
+  cloneCFG(VPBlockBase *Entry,
+           DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBBs,
+           DenseMap<VPValue *, VPValue *> &Old2NewValues,
+           bool FullRemapping = false);
 };
 
 class VPInterleavedAccessInfo {
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 65d241feeab2fca..33d5e2759af5909 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1299,6 +1299,8 @@ struct VPDoubleValueDef : public VPRecipeBase {
     new VPValue(nullptr, this);
   }
 
+  VPRecipeBase *clone() override { return nullptr; }
+
   void execute(struct VPTransformState &State) override {}
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void print(raw_ostream &O, const Twine &Indent,

>From 945e4fe79f0e962be179fb9907a20e91ec889bac Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Jan 2024 10:26:12 +0000
Subject: [PATCH 2/5] !fixup address latest comments, update on top of current
 main.

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 209 +++++++++++++-----------
 llvm/lib/Transforms/Vectorize/VPlan.h   |  58 +++----
 2 files changed, 134 insertions(+), 133 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 99b2a3bd59a64df..b4b4d7ac00733f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -614,17 +614,60 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
   printSuccessors(O, Indent);
 }
 #endif
+static void cloneCFG(VPBlockBase *Entry,
+                     DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewVPBlocks);
+
+static VPBlockBase *cloneVPB(VPBlockBase *BB) {
+  if (auto *VPBB = dyn_cast<VPBasicBlock>(BB)) {
+    auto *NewBlock = new VPBasicBlock(VPBB->getName());
+    for (VPRecipeBase &R : *VPBB)
+      NewBlock->appendRecipe(R.clone());
+    return NewBlock;
+  }
 
-VPBlockBase *VPRegionBlock::clone() {
-  DenseMap<VPBlockBase *, VPBlockBase *> Old2New;
+  auto *VPR = cast<VPRegionBlock>(BB);
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
   DenseMap<VPValue *, VPValue *> Old2NewVPValues;
-  VPBlockBase *NewEntry =
-      VPBlockUtils::cloneCFG(Entry, Old2New, Old2NewVPValues);
-  auto *NewR =
-      new VPRegionBlock(NewEntry, Old2New[Exiting], getName(), isReplicator());
+  cloneCFG(VPR->getEntry(), Old2NewVPBlocks);
+  VPBlockBase *NewEntry = Old2NewVPBlocks[VPR->getEntry()];
+  auto *NewRegion =
+      new VPRegionBlock(NewEntry, Old2NewVPBlocks[VPR->getExiting()],
+                        VPR->getName(), VPR->isReplicator());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
-    Block->setParent(NewR);
-  return NewR;
+    Block->setParent(NewRegion);
+  return NewRegion;
+}
+
+// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
+// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
+// Remapping of operands must be done separately.
+static void cloneCFG(VPBlockBase *Entry,
+                     DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewVPBlocks) {
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+  for (VPBlockBase *BB : RPOT) {
+    VPBlockBase *NewBB = cloneVPB(BB);
+    for (VPBlockBase *Pred : BB->getPredecessors())
+      VPBlockUtils::connectBlocks(Old2NewVPBlocks[Pred], NewBB);
+
+    Old2NewVPBlocks[BB] = NewBB;
+  }
+
+#if !defined(NDEBUG)
+  // Verify that the order of predecessors and successors matches in the cloned
+  // version.
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+      NewRPOT(Old2NewVPBlocks[Entry]);
+  for (const auto &[OldBB, NewBB] : zip(RPOT, NewRPOT)) {
+    for (const auto &[OldPred, NewPred] :
+         zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
+      assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");
+
+    for (const auto &[OldSucc, NewSucc] :
+         zip(OldBB->successors(), NewBB->successors()))
+      assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
+  }
+#endif
 }
 
 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
@@ -994,62 +1037,91 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
 }
 
-static void remapVPValues(VPBasicBlock *OldBB, VPBasicBlock *NewBB,
-                          DenseMap<VPValue *, VPValue *> &Old2NewVPValues,
-                          bool Full = false) {
-  for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
-    for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
-      VPValue *NewOp = Old2NewVPValues.lookup(OldR.getOperand(I));
-      if (!Full)
-        continue;
-      NewR.setOperand(I, NewOp);
+static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
+                          DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
+  // Update the operands of all cloned recipes starting at NewEntry. This
+  // traverses all reachable blocks. This is done in two steps, to handle cycles
+  // in PHI recipes.
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+      OldDeepRPOT(Entry);
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+      NewDeepRPOT(NewEntry);
+  // First, collect all mappings from old to new VPValues defined by cloned
+  // recipes.
+  for (const auto &[OldBB, NewBB] :
+       zip(VPBlockUtils::blocksOnly<VPBasicBlock>(OldDeepRPOT),
+           VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT))) {
+    assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
+           "blocks must have the same number of recipes");
+
+    for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
+      assert(OldR.getNumOperands() == NewR.getNumOperands() &&
+             "recipes must have the same number of operands");
+      assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
+             "recipes must define the same number of operands");
+      for (const auto &[OldV, NewV] :
+           zip(OldR.definedValues(), NewR.definedValues()))
+        Old2NewVPValues[OldV] = NewV;
     }
-    for (const auto &[OldV, NewV] :
-         zip(OldR.definedValues(), NewR.definedValues()))
-      Old2NewVPValues[OldV] = NewV;
+  }
+
+  // Update all operands to use cloned VPValues.
+  for (VPBasicBlock *NewBB :
+       VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT)) {
+    for (VPRecipeBase &NewR : *NewBB)
+      for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
+        VPValue *NewOp = Old2NewVPValues.lookup(NewR.getOperand(I));
+        NewR.setOperand(I, NewOp);
+      }
   }
 }
 
 VPlan *VPlan::clone() {
-  DenseMap<VPBlockBase *, VPBlockBase *> Old2New;
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
   DenseMap<VPValue *, VPValue *> Old2NewVPValues;
 
   auto *NewPlan = new VPlan();
+
+  // Clone live-ins.
   SmallVector<VPValue *, 16> NewLiveIns;
-  for (VPValue *LI : VPLiveInsToFree) {
-    VPValue *NewLI = new VPValue(LI->getLiveInIRValue());
-    NewPlan->VPLiveInsToFree.push_back(NewLI);
-    Old2NewVPValues[LI] = NewLI;
+  for (VPValue *OldLiveIn : VPLiveInsToFree) {
+    VPValue *NewLiveIn = new VPValue(OldLiveIn->getLiveInIRValue());
+    NewPlan->VPLiveInsToFree.push_back(NewLiveIn);
+    Old2NewVPValues[OldLiveIn] = NewLiveIn;
   }
-
   Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
   Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
   if (BackedgeTakenCount) {
-    Old2NewVPValues[BackedgeTakenCount] = new VPValue();
-    NewPlan->BackedgeTakenCount = Old2NewVPValues[BackedgeTakenCount];
+    NewPlan->BackedgeTakenCount = new VPValue();
+    Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
   }
-
-  auto NewPH = cast<VPBasicBlock>(Preheader->clone());
-  remapVPValues(cast<VPBasicBlock>(Preheader), cast<VPBasicBlock>(NewPH),
-                Old2NewVPValues, /*Full*/ true);
-  VPValue *NewTC = Old2NewVPValues.lookup(TripCount);
-  if (!NewTC)
+  assert(TripCount && "trip count must be set");
+  if (TripCount->isLiveIn())
     Old2NewVPValues[TripCount] = new VPValue(TripCount->getLiveInIRValue());
-  NewPlan->TripCount = Old2NewVPValues[TripCount];
 
-  auto *NewEntry = cast<VPBasicBlock>(VPBlockUtils::cloneCFG(
-      getEntry(), Old2New, Old2NewVPValues, /*FullRemapping*/ true));
+  // Clone blocks.
+  cloneCFG(Preheader, Old2NewVPBlocks);
+  cloneCFG(getEntry(), Old2NewVPBlocks);
+
+  auto *NewPreheader = cast<VPBasicBlock>(Old2NewVPBlocks[Preheader]);
+  remapOperands(Preheader, NewPreheader, Old2NewVPValues);
+  auto *NewEntry = cast<VPBasicBlock>(Old2NewVPBlocks[Entry]);
+  remapOperands(Entry, NewEntry, Old2NewVPValues);
+
+  // Clone live-outs.
+  for (const auto &[_, LO] : LiveOuts)
+    NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
 
+  // Initialize fields of cloned VPlan.
   NewPlan->Entry = NewEntry;
-  NewPlan->Preheader = NewPH;
+  NewPlan->Preheader = NewPreheader;
   NewEntry->setPlan(NewPlan);
-  NewPH->setPlan(NewPlan);
+  NewPreheader->setPlan(NewPlan);
   NewPlan->VFs = VFs;
   NewPlan->UFs = UFs;
+  // TODO: Adjust names.
   NewPlan->Name = Name;
-
-  for (const auto &[_, LO] : LiveOuts)
-    NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
+  NewPlan->TripCount = Old2NewVPValues[TripCount];
   return NewPlan;
 }
 
@@ -1271,59 +1343,6 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
 }
 #endif
 
-VPBlockBase *VPBlockUtils::cloneCFG(
-    VPBlockBase *Entry, DenseMap<VPBlockBase *, VPBlockBase *> &Old2New,
-    DenseMap<VPValue *, VPValue *> &Old2NewVPValues, bool FullRemapping) {
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
-      Entry);
-  VPBlockBase *NewEntry = nullptr;
-  for (VPBlockBase *BB : RPOT) {
-    VPBlockBase *NewBB = BB->clone();
-    if (!NewEntry)
-      NewEntry = NewBB;
-
-    for (VPBlockBase *Pred : BB->getPredecessors())
-      connectBlocks(Old2New[Pred], NewBB);
-
-    Old2New[BB] = NewBB;
-
-    if (!isa<VPBasicBlock>(BB))
-      continue;
-  }
-
-  // Update the operands of all cloned recipes starting at NewEntry. This
-  // traverses all reachable blocks. This is done in two steps, to handle cycles
-  // in PHI recipes.
-  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
-      OldDeepRPOT(Entry);
-  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
-      NewDeepRPOT(NewEntry);
-  // First, collect all mappings from old to new VPValues defined by cloned
-  // recipes.
-  for (const auto &[OldBB, NewBB] :
-       zip(VPBlockUtils::blocksOnly<VPBasicBlock>(OldDeepRPOT),
-           VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT))) {
-    for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB))
-      for (const auto &[OldV, NewV] :
-           zip(OldR.definedValues(), NewR.definedValues()))
-        Old2NewVPValues[OldV] = NewV;
-  }
-
-  // Update all operands to use cloned VPValues.
-  for (VPBasicBlock *NewBB :
-       VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT)) {
-    for (VPRecipeBase &NewR : *NewBB)
-      for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
-        VPValue *NewOp = Old2NewVPValues.lookup(NewR.getOperand(I));
-        if (!FullRemapping)
-          continue;
-        NewR.setOperand(I, NewOp);
-      }
-  }
-
-  return NewEntry;
-}
-
 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                           Old2NewTy &Old2New,
                                           InterleavedAccessInfo &IAI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 630684488e3795e..f5a38c0ebb29986 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -480,8 +480,6 @@ class VPBlockBase {
 
   using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
 
-  virtual VPBlockBase *clone() = 0;
-
   virtual ~VPBlockBase() = default;
 
   const std::string &getName() const { return Name; }
@@ -1393,7 +1391,8 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
 
   VPRecipeBase *clone() override {
     return new VPWidenCallRecipe(*cast<CallInst>(getUnderlyingInstr()),
-                                 operands(), VectorIntrinsicID, Variant);
+                                 operands(), VectorIntrinsicID, getDebugLoc(),
+                                 Variant);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
@@ -1508,7 +1507,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
 
   VPRecipeBase *clone() override {
     return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse,
-                                     getDebugLoc());
+                                     isInBounds(), getDebugLoc());
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2468,28 +2467,28 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
   /// for floating point inductions.
   const FPMathOperator *FPBinOp;
 
-public:
-  VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
+  VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
+                    const FPMathOperator *FPBinOp, VPValue *Start,
                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
                     Type *TruncResultTy)
       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
-        TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()),
-        FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) {
-  }
-  VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
-                    const FPMathOperator *FPBinOp, VPValue *Start,
+        TruncResultTy(TruncResultTy), Kind(Kind), FPBinOp(FPBinOp) {}
+
+public:
+  VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
                     Type *TruncResultTy)
-      : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
-        VPValue(this), TruncResultTy(TruncResultTy), Kind(Kind),
-        FPBinOp(FPBinOp) {}
+      : VPDerivedIVRecipe(
+            IndDesc.getKind(),
+            dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
+            Start, CanonicalIV, Step, TruncResultTy) {}
 
   ~VPDerivedIVRecipe() override = default;
 
   VPRecipeBase *clone() override {
-    return new VPDerivedIVRecipe(Kind, FPBinOp, getOperand(0),
-                                 cast<VPCanonicalIVPHIRecipe>(getOperand(1)),
-                                 getOperand(2), TruncResultTy);
+    return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(),
+                                 getCanonicalIV(), getStepValue(),
+                                 TruncResultTy);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
@@ -2510,7 +2509,9 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
   }
 
   VPValue *getStartValue() const { return getOperand(0); }
-  VPValue *getCanonicalIV() const { return getOperand(1); }
+  VPCanonicalIVPHIRecipe *getCanonicalIV() const {
+    return cast<VPCanonicalIVPHIRecipe>(getOperand(1));
+  }
   VPValue *getStepValue() const { return getOperand(2); }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
@@ -2593,13 +2594,6 @@ class VPBasicBlock : public VPBlockBase {
       Recipes.pop_back();
   }
 
-  VPBlockBase *clone() override {
-    auto *NewBlock = new VPBasicBlock(getName());
-    for (VPRecipeBase &R : *this)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
-
   /// Instruction iterators...
   using iterator = RecipeListTy::iterator;
   using const_iterator = RecipeListTy::const_iterator;
@@ -2738,8 +2732,6 @@ class VPRegionBlock : public VPBlockBase {
     }
   }
 
-  VPBlockBase *clone() override;
-
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
@@ -3052,6 +3044,7 @@ class VPlan {
   VPBasicBlock *getPreheader() { return Preheader; }
   const VPBasicBlock *getPreheader() const { return Preheader; }
 
+  /// Clone the current VPlan and return it.
   VPlan *clone();
 
 private:
@@ -3216,17 +3209,6 @@ class VPBlockUtils {
       return cast<BlockTy>(&Block);
     });
   }
-
-  /// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
-  /// the blocks and their recipes. Operands of cloned recipes will be updated
-  /// to use new VPValues from \p Old2NewValues. If \p FullRemapping is set to
-  /// true, then all old VPValues from outside the cloned nodes must be mapped
-  /// in \p Old2NewValues.
-  static VPBlockBase *
-  cloneCFG(VPBlockBase *Entry,
-           DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBBs,
-           DenseMap<VPValue *, VPValue *> &Old2NewValues,
-           bool FullRemapping = false);
 };
 
 class VPInterleavedAccessInfo {

>From 5a845863da584d6810bc3706397c782e3cd4d395 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 25 Jan 2024 20:51:12 +0000
Subject: [PATCH 3/5] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 52 ++++++++++++-------------
 llvm/lib/Transforms/Vectorize/VPlan.h   |  8 +---
 2 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b4b4d7ac00733f4..ac51dfc76f19011 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -614,8 +614,7 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
   printSuccessors(O, Indent);
 }
 #endif
-static void cloneCFG(VPBlockBase *Entry,
-                     DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewVPBlocks);
+static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry);
 
 static VPBlockBase *cloneVPB(VPBlockBase *BB) {
   if (auto *VPBB = dyn_cast<VPBasicBlock>(BB)) {
@@ -626,23 +625,21 @@ static VPBlockBase *cloneVPB(VPBlockBase *BB) {
   }
 
   auto *VPR = cast<VPRegionBlock>(BB);
-  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
-  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
-  cloneCFG(VPR->getEntry(), Old2NewVPBlocks);
-  VPBlockBase *NewEntry = Old2NewVPBlocks[VPR->getEntry()];
-  auto *NewRegion =
-      new VPRegionBlock(NewEntry, Old2NewVPBlocks[VPR->getExiting()],
-                        VPR->getName(), VPR->isReplicator());
+  const auto &[NewEntry, NewExiting] = cloneSESE(VPR->getEntry());
+  auto *NewRegion = new VPRegionBlock(NewEntry, NewExiting, VPR->getName(),
+                                      VPR->isReplicator());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
     Block->setParent(NewRegion);
   return NewRegion;
 }
 
-// Clone the CFG for all nodes reachable from \p Entry, this includes cloning
-// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
-// Remapping of operands must be done separately.
-static void cloneCFG(VPBlockBase *Entry,
-                     DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewVPBlocks) {
+// Clone the CFG for all nodes in the single-entry-single-exit region reachable
+// from \p Entry, this includes cloning the blocks and their recipes. Operands
+// of cloned recipes will NOT be updated. Remapping of operands must be done
+// separately. Returns a pair with the new entry and exiting blocks of the
+// cloned region.
+static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry) {
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   for (VPBlockBase *BB : RPOT) {
@@ -668,6 +665,9 @@ static void cloneCFG(VPBlockBase *Entry,
       assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
   }
 #endif
+
+  return std::make_pair(Old2NewVPBlocks[Entry],
+                        Old2NewVPBlocks[*reverse(RPOT).begin()]);
 }
 
 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
@@ -1077,13 +1077,15 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
 }
 
 VPlan *VPlan::clone() {
-  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
   DenseMap<VPValue *, VPValue *> Old2NewVPValues;
 
-  auto *NewPlan = new VPlan();
+  // Clone blocks.
+  VPBlockBase *NewPreheader = cloneVPB(Preheader);
+  const auto &[NewEntry, __] = cloneSESE(getEntry());
 
-  // Clone live-ins.
-  SmallVector<VPValue *, 16> NewLiveIns;
+  // Create VPlan, clone live-ins and remap operands in the cloned blocks.
+  auto *NewPlan =
+      new VPlan(cast<VPBasicBlock>(NewPreheader), cast<VPBasicBlock>(NewEntry));
   for (VPValue *OldLiveIn : VPLiveInsToFree) {
     VPValue *NewLiveIn = new VPValue(OldLiveIn->getLiveInIRValue());
     NewPlan->VPLiveInsToFree.push_back(NewLiveIn);
@@ -1098,29 +1100,25 @@ VPlan *VPlan::clone() {
   assert(TripCount && "trip count must be set");
   if (TripCount->isLiveIn())
     Old2NewVPValues[TripCount] = new VPValue(TripCount->getLiveInIRValue());
+  // else NewTripCount will be created and inserted into Old2NewVPValues when
+  // TripCount is cloned. In any case NewPlan->TripCount is updated below.
 
-  // Clone blocks.
-  cloneCFG(Preheader, Old2NewVPBlocks);
-  cloneCFG(getEntry(), Old2NewVPBlocks);
-
-  auto *NewPreheader = cast<VPBasicBlock>(Old2NewVPBlocks[Preheader]);
   remapOperands(Preheader, NewPreheader, Old2NewVPValues);
-  auto *NewEntry = cast<VPBasicBlock>(Old2NewVPBlocks[Entry]);
   remapOperands(Entry, NewEntry, Old2NewVPValues);
 
   // Clone live-outs.
   for (const auto &[_, LO] : LiveOuts)
     NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
 
-  // Initialize fields of cloned VPlan.
-  NewPlan->Entry = NewEntry;
-  NewPlan->Preheader = NewPreheader;
+  // Initialize remaining fields of cloned VPlan.
   NewEntry->setPlan(NewPlan);
   NewPreheader->setPlan(NewPlan);
   NewPlan->VFs = VFs;
   NewPlan->UFs = UFs;
   // TODO: Adjust names.
   NewPlan->Name = Name;
+  assert(Old2NewVPValues.contains(TripCount) &&
+         "TripCount must have been added to Old2NewVPValues");
   NewPlan->TripCount = Old2NewVPValues[TripCount];
   return NewPlan;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f5a38c0ebb29986..55912361d692e8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1735,10 +1735,7 @@ class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
   }
 
   VPRecipeBase *clone() override {
-    auto *Res = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
-                                     getOperand(0));
-    Res->IncomingBlocks = IncomingBlocks;
-    return Res;
+    llvm_unreachable("cloning not implemented yet");
   }
 
   ~VPWidenPHIRecipe() override = default;
@@ -2852,9 +2849,6 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
-  /// Construct an uninitialized VPlan, should be used for cloning only.
-  explicit VPlan() = default;
-
 public:
   /// Construct a VPlan with original preheader \p Preheader, trip count \p TC
   /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to

>From 9b1f6b3c0fb0feed29fede8d5f43b600de87fe31 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jan 2024 11:16:04 +0000
Subject: [PATCH 4/5] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 44 +++++++------------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 23 +++++++++-
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a646d063d0ce425..0843718800e5b4d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10097,7 +10097,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
-        VPlan &BestMainPlan = *LVP.getBestPlanFor(EPI.MainLoopVF).clone();
+        VPlan &BestMainPlan = *LVP.getBestPlanFor(EPI.MainLoopVF).duplicate();
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 2adedbb9f6c116a..238c9a7c0abad43 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -614,24 +614,8 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
   printSuccessors(O, Indent);
 }
 #endif
-static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry);
-
-static VPBlockBase *cloneVPB(VPBlockBase *BB) {
-  if (auto *VPBB = dyn_cast<VPBasicBlock>(BB)) {
-    auto *NewBlock = new VPBasicBlock(VPBB->getName());
-    for (VPRecipeBase &R : *VPBB)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
 
-  auto *VPR = cast<VPRegionBlock>(BB);
-  const auto &[NewEntry, NewExiting] = cloneSESE(VPR->getEntry());
-  auto *NewRegion = new VPRegionBlock(NewEntry, NewExiting, VPR->getName(),
-                                      VPR->isReplicator());
-  for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
-    Block->setParent(NewRegion);
-  return NewRegion;
-}
+static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry);
 
 // Clone the CFG for all nodes in the single-entry-single-exit region reachable
 // from \p Entry, this includes cloning the blocks and their recipes. Operands
@@ -643,7 +627,7 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   for (VPBlockBase *BB : RPOT) {
-    VPBlockBase *NewBB = cloneVPB(BB);
+    VPBlockBase *NewBB = BB->clone();
     for (VPBlockBase *Pred : BB->getPredecessors())
       VPBlockUtils::connectBlocks(Old2NewVPBlocks[Pred], NewBB);
 
@@ -670,6 +654,15 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneSESE(VPBlockBase *Entry) {
                         Old2NewVPBlocks[*reverse(RPOT).begin()]);
 }
 
+VPRegionBlock *VPRegionBlock::clone() {
+  const auto &[NewEntry, NewExiting] = cloneSESE(getEntry());
+  auto *NewRegion =
+      new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+  for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
+    Block->setParent(NewRegion);
+  return NewRegion;
+}
+
 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
   for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
     // Drop all references in VPBasicBlocks and replace all uses with
@@ -1053,7 +1046,6 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
            VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT))) {
     assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
            "blocks must have the same number of recipes");
-
     for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
       assert(OldR.getNumOperands() == NewR.getNumOperands() &&
              "recipes must have the same number of operands");
@@ -1076,16 +1068,14 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
   }
 }
 
-VPlan *VPlan::clone() {
-  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
-
+VPlan *VPlan::duplicate() {
   // Clone blocks.
-  VPBlockBase *NewPreheader = cloneVPB(Preheader);
-  const auto &[NewEntry, __] = cloneSESE(getEntry());
+  VPBasicBlock *NewPreheader = Preheader->clone();
+  const auto &[NewEntry, __] = cloneSESE(Entry);
 
   // Create VPlan, clone live-ins and remap operands in the cloned blocks.
-  auto *NewPlan =
-      new VPlan(cast<VPBasicBlock>(NewPreheader), cast<VPBasicBlock>(NewEntry));
+  auto *NewPlan = new VPlan(NewPreheader, cast<VPBasicBlock>(NewEntry));
+  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
   for (VPValue *OldLiveIn : VPLiveInsToFree) {
     VPValue *NewLiveIn = new VPValue(OldLiveIn->getLiveInIRValue());
     NewPlan->VPLiveInsToFree.push_back(NewLiveIn);
@@ -1111,8 +1101,6 @@ VPlan *VPlan::clone() {
     NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
 
   // Initialize remaining fields of cloned VPlan.
-  NewEntry->setPlan(NewPlan);
-  NewPreheader->setPlan(NewPlan);
   NewPlan->VFs = VFs;
   NewPlan->UFs = UFs;
   // TODO: Adjust names.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 55912361d692e8a..3f2a55080bd53fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -662,6 +662,11 @@ class VPBlockBase {
   /// Dump this VPBlockBase to dbgs().
   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
 #endif
+
+  /// Clone the current block and its recipes without updating the operands of
+  /// the cloned recipes, including all blocks in the single-entry single-exit
+  /// region for VPRegionBlocks.
+  virtual VPBlockBase *clone() = 0;
 };
 
 /// A value that is used outside the VPlan. The operand of the user needs to be
@@ -2681,6 +2686,15 @@ class VPBasicBlock : public VPBlockBase {
   /// Returns true if the block is exiting it's parent region.
   bool isExiting() const;
 
+  /// Clone the current block and its recipes, without updating the operands of
+  /// the cloned recipes.
+  VPBasicBlock *clone() override {
+    auto *NewBlock = new VPBasicBlock(getName());
+    for (VPRecipeBase &R : *this)
+      NewBlock->appendRecipe(R.clone());
+    return NewBlock;
+  }
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -2785,6 +2799,10 @@ class VPRegionBlock : public VPBlockBase {
              VPSlotTracker &SlotTracker) const override;
   using VPBlockBase::print; // Get the print(raw_stream &O) version.
 #endif
+
+  /// Clone all blocks in the single-entry single-exit region of the block and
+  /// their recipes without updating the operands of the cloned recipes.
+  VPRegionBlock *clone() override;
 };
 
 /// VPlan models a candidate for vectorization, encoding various decisions take
@@ -3038,8 +3056,9 @@ class VPlan {
   VPBasicBlock *getPreheader() { return Preheader; }
   const VPBasicBlock *getPreheader() const { return Preheader; }
 
-  /// Clone the current VPlan and return it.
-  VPlan *clone();
+  /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
+  /// recipes to refer to the clones, and return it.
+  VPlan *duplicate();
 
 private:
   /// Add to the given dominator tree the header block and every new basic block

>From 0033a40f69e4dc022c2ea8e36ef76937cabd00c5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 27 Jan 2024 12:56:14 +0000
Subject: [PATCH 5/5] !fixup implement clone() for VPScalarCastRecipe.

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 668a7d17b011269..20792cb9ac7c1c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1389,6 +1389,10 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
 
   ~VPScalarCastRecipe() override = default;
 
+  VPRecipeBase *clone() override {
+    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
 
   void execute(VPTransformState &State) override;
@@ -2495,10 +2499,9 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 
   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
                     const FPMathOperator *FPBinOp, VPValue *Start,
-                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
-                    Type *TruncResultTy)
+                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
-        TruncResultTy(TruncResultTy), Kind(Kind), FPBinOp(FPBinOp) {}
+        Kind(Kind), FPBinOp(FPBinOp) {}
 
 public:
   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
@@ -2512,8 +2515,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 
   VPRecipeBase *clone() override {
     return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(),
-                                 getCanonicalIV(), getStepValue(),
-                                 TruncResultTy);
+                                 getCanonicalIV(), getStepValue());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)



More information about the libcxx-commits mailing list