[llvm] [VPlan] Materialize Build(Struct)Vectors for VPReplicateRecipes. (NFCI) (PR #151487)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 03:28:14 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Materialze Build(Struct)Vectors explicitly for VPRecplicateRecipes, to serve their users requiring a vector, instead of doing so when unrolling by VF.

Now we only need to implicitly build vectors in VPTransformState::get for VPInstructions. Once they are also unrolled by VF we can remove the code-path alltogether.

---
Full diff: https://github.com/llvm/llvm-project/pull/151487.diff


6 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+45) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+29-16) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 850c4a11edc67..9c547f759617c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7284,6 +7284,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
+  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   bool HasBranchWeights =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 25b9616880bf4..96ccf5bf50a25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -372,6 +372,8 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     set(Def, VectorValue);
   } else {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+    assert(isa<VPInstruction>(Def) && "Explicit BuildVector recipes must "
+                                      "handle packing for non-VPInstructions.");
     // Initialize packing with insertelements to start from poison.
     VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68e7c20a070f4..7689ed6c2bd35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::Load:
   case VPInstruction::AnyOf:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExplicitVectorLength:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ad8235d891c5d..4c0f7f40408c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3192,6 +3192,51 @@ void VPlanTransforms::materializeVectorTripCount(
     Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
 }
 
+void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+  if (Plan.hasScalarVFOnly())
+    return;
+
+  VPTypeAnalysis TypeInfo(Plan);
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getEntry()));
+  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(LoopRegion->getEntry()));
+  for (VPBasicBlock *VPBB :
+       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR || RepR->isSingleScalar())
+        continue;
+      VPInstruction *BuildVector = nullptr;
+      for (VPUser *U : to_vector(RepR->users())) {
+        VPRegionBlock *ParentRegion =
+            cast<VPRecipeBase>(U)->getParent()->getParent();
+        if (U->usesScalars(RepR) && ParentRegion == LoopRegion)
+          continue;
+
+        if (!BuildVector) {
+          Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+          unsigned Opc = ScalarTy->isStructTy()
+                             ? VPInstruction::BuildStructVector
+                             : VPInstruction::BuildVector;
+          BuildVector = new VPInstruction(Opc, {RepR});
+          BuildVector->insertAfter(RepR);
+        }
+
+        // Only update a single operand per users, as the same user is added
+        // multiple times, once per use.
+        // TODO: Introduce de-duplicating iterator over users.
+        for (unsigned Idx = 0; Idx != U->getNumOperands(); ++Idx)
+          if (U->getOperand(Idx) == RepR) {
+            U->setOperand(Idx, BuildVector);
+            break;
+          }
+      }
+    }
+  }
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 880159f760922..1a19e15bbaa25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,10 @@ struct VPlanTransforms {
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE);
 
+  /// Add explicit Build[Struct]Vector recipes that combine scalar values
+  /// produced by VPReplicateRecipes to a single vector.
+  static void materializeBuildVectors(VPlan &Plan);
+
   /// Try to convert a plan with interleave groups with VF elements to a plan
   /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 871e37ef3966a..7f23fb5b7d11a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -463,9 +463,10 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
 }
 
 /// Create a single-scalar clone of \p RepR for lane \p Lane.
-static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
-                                       Type *IdxTy, VPReplicateRecipe *RepR,
-                                       VPLane Lane) {
+static VPReplicateRecipe *
+cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
+             VPReplicateRecipe *RepR, VPLane Lane,
+             DenseMap<VPValue *, SmallVector<VPValue *>> &Value2Lanes) {
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : RepR->operands()) {
@@ -478,6 +479,11 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
     }
+    if (Value2Lanes.contains(Op)) {
+      NewOps.push_back(Value2Lanes[Op][Lane.getKnownLane()]);
+      continue;
+    }
+
     // Look through buildvector to avoid unnecessary extracts.
     if (match(Op, m_BuildVector())) {
       NewOps.push_back(
@@ -510,6 +516,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
   auto VPBBsToUnroll =
       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+  DenseMap<VPValue *, SmallVector<VPValue *>> Value2Lanes;
+  SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -521,12 +529,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
         if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
             vputils::isSingleScalar(RepR->getOperand(1))) {
           // Stores to invariant addresses need to store the last lane only.
-          cloneForLane(Plan, Builder, IdxTy, RepR,
-                       VPLane::getLastLaneForVF(VF));
+          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
+                       Value2Lanes);
         } else {
           // Create single-scalar version of RepR for all lanes.
           for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
+            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes);
         }
         RepR->eraseFromParent();
         continue;
@@ -534,23 +542,28 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       /// Create single-scalar version of RepR for all lanes.
       SmallVector<VPValue *> LaneDefs;
       for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+        LaneDefs.push_back(
+            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Value2Lanes));
 
+      Value2Lanes[RepR] = LaneDefs;
       /// Users that only demand the first lane can use the definition for lane
       /// 0.
       RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
         return U.onlyFirstLaneUsed(RepR);
       });
 
-      // If needed, create a Build(Struct)Vector recipe to insert the scalar
-      // lane values into a vector.
-      Type *ResTy = RepR->getUnderlyingInstr()->getType();
-      VPValue *VecRes = Builder.createNaryOp(
-          ResTy->isStructTy() ? VPInstruction::BuildStructVector
-                              : VPInstruction::BuildVector,
-          LaneDefs);
-      RepR->replaceAllUsesWith(VecRes);
-      RepR->eraseFromParent();
+      for (VPUser *U : to_vector(RepR->users())) {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
+                     VPI->getOpcode() != VPInstruction::BuildStructVector))
+          continue;
+        VPI->setOperand(0, LaneDefs[0]);
+        for (VPValue *Def : drop_begin(LaneDefs))
+          VPI->addOperand(Def);
+      }
+      ToRemove.push_back(RepR);
     }
   }
+  for (auto *R : reverse(ToRemove))
+    R->eraseFromParent();
 }

``````````

</details>


https://github.com/llvm/llvm-project/pull/151487


More information about the llvm-commits mailing list