[llvm] [VPlan] Dissolve replicate regions with vector live-outs. (PR #189022)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 27 08:15:03 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-systemz
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
Remove the scalar VF restriction and properly handle replicate regions
with vector live-outs.
After unrolling the replicate regions, we end up with a set of scalar
VPPhis. The current patch post-processes them and converts them to
a chain of InsertElement + VPWidenPHIRecipes to match the original codegen
as closely as possible.
An alternative would be to keep the phis scalar and combine them with
BuildVector at the end, but that would result in quite different
codegen.
Now that ::execute for replicate regions is dead, clean up
VPTransformState::Lane and the various ::execute implementations that
relied on it.
Depends on https://github.com/llvm/llvm-project/pull/186252 (included in PR)
---
Patch is 326.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189022.diff
50 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+11-42)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanHelpers.h (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+22-96)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+1-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+94-9)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+6-9)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll (+4-8)
- (modified) llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll (+2-6)
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll (+50-96)
- (modified) llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll (-6)
- (modified) llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll (+2-6)
- (modified) llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll (-16)
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll (+9-16)
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll (-2)
- (modified) llvm/test/Transforms/LoopVectorize/X86/predicated-udiv.ll (+8-24)
- (modified) llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll (+9-22)
- (modified) llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll (+120-200)
- (modified) llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/cast-induction.ll (+3-4)
- (modified) llvm/test/Transforms/LoopVectorize/cse-casts.ll (+1-10)
- (modified) llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll (+6-2)
- (modified) llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll (+20-40)
- (modified) llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll (+5-10)
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll (+18-54)
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll (+14-40)
- (modified) llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll (+12-28)
- (modified) llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll (+4-14)
- (modified) llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/induction.ll (+4-8)
- (modified) llvm/test/Transforms/LoopVectorize/iv-select-cmp-fold-tail.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll (+5-10)
- (modified) llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll (-2)
- (modified) llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll (+4-6)
- (modified) llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll (+2-4)
- (modified) llvm/test/Transforms/LoopVectorize/reduction-inloop.ll (-12)
- (modified) llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll (+1-2)
- (modified) llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll (+44-48)
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-div.ll (+12-36)
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/uniform-blend.ll (-4)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 919166099a4d1..2bcab2c341e61 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -491,7 +491,8 @@ VPIRBasicBlock *VPIRBasicBlock::clone() {
}
void VPBasicBlock::execute(VPTransformState *State) {
- bool Replica = bool(State->Lane);
+ assert(!State->Lane &&
+ "replicate regions must be dissolved before ::execute");
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
if (VPBlockUtils::isHeader(this, State->VPDT)) {
@@ -507,31 +508,17 @@ void VPBasicBlock::execute(VPTransformState *State) {
State->LI->addTopLevelLoop(State->CurrentParentLoop);
}
- auto IsReplicateRegion = [](VPBlockBase *BB) {
- auto *R = dyn_cast_or_null<VPRegionBlock>(BB);
- assert((!R || R->isReplicator()) &&
- "only replicate region blocks should remain");
- return R;
- };
// 1. Create an IR basic block.
- if ((Replica && this == getParent()->getEntry()) ||
- IsReplicateRegion(getSingleHierarchicalPredecessor())) {
- // Reuse the previous basic block if the current VPBB is either
- // * the entry to a replicate region, or
- // * the exit of a replicate region.
- State->CFG.VPBB2IRBB[this] = NewBB;
- } else {
- NewBB = createEmptyBasicBlock(*State);
+ NewBB = createEmptyBasicBlock(*State);
- State->Builder.SetInsertPoint(NewBB);
- // Temporarily terminate with unreachable until CFG is rewired.
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
+ State->Builder.SetInsertPoint(NewBB);
+ // Temporarily terminate with unreachable until CFG is rewired.
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
- State->CFG.PrevBB = NewBB;
- State->CFG.VPBB2IRBB[this] = NewBB;
- connectToPredecessors(*State);
- }
+ State->CFG.PrevBB = NewBB;
+ State->CFG.VPBB2IRBB[this] = NewBB;
+ connectToPredecessors(*State);
// 2. Fill the IR basic block with IR instructions.
executeRecipes(State, NewBB);
@@ -754,25 +741,7 @@ VPRegionBlock *VPRegionBlock::clone() {
}
void VPRegionBlock::execute(VPTransformState *State) {
- assert(isReplicator() &&
- "Loop regions should have been lowered to plain CFG");
- assert(!State->Lane && "Replicating a Region with non-null instance.");
- assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
-
- ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
- Entry);
- State->Lane = VPLane(0);
- for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
- State->Lane = VPLane(Lane, VPLane::Kind::First);
- // Visit the VPBlocks connected to \p this, starting from it.
- for (VPBlockBase *Block : RPOT) {
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
- }
-
- // Exit replicating mode.
- State->Lane.reset();
+ llvm_unreachable("regions must get dissolved before ::execute");
}
InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 9667227a329ac..d25ec69468c5c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -76,6 +76,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
switch (Opcode) {
case Instruction::ExtractElement:
+ case Instruction::InsertElement:
case Instruction::Freeze:
case Instruction::PHI:
case VPInstruction::Broadcast:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 0d4a842c700f6..6b34ee62489c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -210,6 +210,7 @@ struct VPTransformState {
/// Hold the index to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
+ /// TODO: This is now only used in asserts. Remove as follow-up.
std::optional<VPLane> Lane;
struct DataState {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8f937b2f8bc1f..bafa2ba779792 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -482,6 +482,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::CalculateTripCountMinusVF:
return 2;
case Instruction::Select:
+ case Instruction::InsertElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::ReductionStartVector:
return 3;
@@ -571,6 +572,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
}
+ case Instruction::InsertElement: {
+ Value *Vec = State.get(getOperand(0));
+ Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
+ Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
+ return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
+ }
case Instruction::Freeze: {
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
return Builder.CreateFreeze(Op, Name);
@@ -1333,6 +1340,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
switch (getOpcode()) {
case Instruction::GetElementPtr:
case Instruction::ExtractElement:
+ case Instruction::InsertElement:
case Instruction::Freeze:
case Instruction::FCmp:
case Instruction::ICmp:
@@ -1386,6 +1394,8 @@ bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
return false;
case Instruction::ExtractElement:
return Op == getOperand(1);
+ case Instruction::InsertElement:
+ return Op != getOperand(0);
case Instruction::PHI:
return true;
case Instruction::FCmp:
@@ -1637,7 +1647,11 @@ void VPPhi::execute(VPTransformState &State) {
PHINode *NewPhi = State.Builder.CreatePHI(
State.TypeAnalysis.inferScalarType(this), 2, getName());
unsigned NumIncoming = getNumIncoming();
- if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
+ // Detect header phis: the parent block dominates its second incoming block
+ // (the latch). Non-header phis, e.g. from dissolved replicate regions, don't
+ // have this property.
+ if (NumIncoming == 2 &&
+ State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
// TODO: Fixup all incoming values of header phis once recipes defining them
// are introduced.
NumIncoming = 1;
@@ -2588,10 +2602,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
unsigned StartLane = 0;
unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
- if (State.Lane) {
- StartLane = State.Lane->getKnownLane();
- EndLane = StartLane + 1;
- }
+ assert(!State.Lane && "must be replicated");
Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
: Constant::getNullValue(BaseIVTy);
@@ -3306,28 +3317,11 @@ static void scalarizeInstruction(const Instruction *Instr,
}
void VPReplicateRecipe::execute(VPTransformState &State) {
+ assert(!State.Lane);
Instruction *UI = getUnderlyingInstr();
-
- if (!State.Lane) {
- assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
- "must have already been unrolled");
- scalarizeInstruction(UI, this, VPLane(0), State);
- return;
- }
-
- assert((State.VF.isScalar() || !isSingleScalar()) &&
- "uniform recipe shouldn't be predicated");
- assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- scalarizeInstruction(UI, this, *State.Lane, State);
- // Insert scalar instance packing it into a vector.
- if (State.VF.isVector() && shouldPack()) {
- Value *WideValue =
- State.Lane->isFirstLane()
- ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
- : State.get(this);
- State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
- *State.Lane));
- }
+ assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
+ "must have already been unrolled");
+ scalarizeInstruction(UI, this, VPLane(0), State);
}
bool VPReplicateRecipe::shouldPack() const {
@@ -3693,20 +3687,7 @@ void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
#endif
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
- assert(State.Lane && "Branch on Mask works only on single instance.");
-
- VPValue *BlockInMask = getOperand(0);
- Value *ConditionBit = State.get(BlockInMask, *State.Lane);
-
- // Replace the temporary unreachable terminator with a new conditional branch,
- // whose two destinations will be set later when they are created.
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional branch.");
- auto CondBr =
- State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
- CondBr->setSuccessor(0, nullptr);
- CurrentTerminator->eraseFromParent();
+ llvm_unreachable("recipe must be removed when dissolving replicate region");
}
InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
@@ -3718,62 +3699,7 @@ InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
}
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
- assert(State.Lane && "Predicated instruction PHI works per instance.");
- Instruction *ScalarPredInst =
- cast<Instruction>(State.get(getOperand(0), *State.Lane));
- BasicBlock *PredicatedBB = ScalarPredInst->getParent();
- BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
- assert(PredicatingBB && "Predicated block has no single predecessor.");
- assert(isa<VPReplicateRecipe>(getOperand(0)) &&
- "operand must be VPReplicateRecipe");
-
- // By current pack/unpack logic we need to generate only a single phi node: if
- // a vector value for the predicated instruction exists at this point it means
- // the instruction has vector users only, and a phi for the vector value is
- // needed. In this case the recipe of the predicated instruction is marked to
- // also do that packing, thereby "hoisting" the insert-element sequence.
- // Otherwise, a phi node for the scalar value is needed.
- if (State.hasVectorValue(getOperand(0))) {
- auto *VecI = cast<Instruction>(State.get(getOperand(0)));
- assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
- "Packed operands must generate an insertelement or insertvalue");
-
- // If VectorI is a struct, it will be a sequence like:
- // %1 = insertvalue %unmodified, %x, 0
- // %2 = insertvalue %1, %y, 1
- // %VectorI = insertvalue %2, %z, 2
- // To get the unmodified vector we need to look through the chain.
- if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
- for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
- VecI = cast<InsertValueInst>(VecI->getOperand(0));
-
- PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
- VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
- VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
- if (State.hasVectorValue(this))
- State.reset(this, VPhi);
- else
- State.set(this, VPhi);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), VPhi);
- } else {
- if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
- return;
-
- Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
- PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
- PredicatingBB);
- Phi->addIncoming(ScalarPredInst, PredicatedBB);
- if (State.hasScalarValue(this, *State.Lane))
- State.reset(this, Phi, *State.Lane);
- else
- State.set(this, Phi, *State.Lane);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), Phi, *State.Lane);
- }
+ llvm_unreachable("recipe must be removed when dissolving replicate region");
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5556304265489..c94545a081fde 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -597,10 +597,7 @@ bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
auto *ParentRegion = VPBB->getParent();
if (ParentRegion && ParentRegion->getExiting() == VPBB)
ParentRegion->setExiting(PredVPBB);
- for (auto *Succ : to_vector(VPBB->successors())) {
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- VPBlockUtils::connectBlocks(PredVPBB, Succ);
- }
+ VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
// VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 867c7c8ef5045..3103f6cc8b731 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;
@@ -669,7 +670,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
/// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
/// converted into BranchOnCond and extracts are created as needed.
static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
- VPBlockBase *Entry) {
+ VPBlockBase *Entry,
+ ElementCount VF) {
VPValue *Idx0 = Plan.getZero(IdxTy);
for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) {
for (VPRecipeBase &OldR : make_early_inc_range(cast<VPBasicBlock>(*VPB))) {
@@ -681,7 +683,8 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
// same block (already scalar), or values that are already single
// scalars.
auto *DefR = Op->getDefiningRecipe();
- if ((isa_and_present<VPScalarIVStepsRecipe>(DefR) &&
+ if (VF.isScalar() ||
+ (isa_and_present<VPScalarIVStepsRecipe>(DefR) &&
DefR->getParent() == VPB) ||
vputils::isSingleScalar(Op))
continue;
@@ -705,6 +708,15 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
{BranchOnMask->getOperand(0)},
BranchOnMask->getDebugLoc());
BranchOnMask->eraseFromParent();
+ } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(&OldR)) {
+ VPValue *PredOp = PredPhi->getOperand(0);
+ VPValue *PoisonVal = Plan.getOrAddLiveIn(
+ PoisonValue::get(VPTypeAnalysis(Plan).inferScalarType(PredOp)));
+
+ VPPhi *NewPhi = Builder.createScalarPhi({PoisonVal, PredOp},
+ PredPhi->getDebugLoc());
+ PredPhi->replaceAllUsesWith(NewPhi);
+ PredPhi->eraseFromParent();
} else {
assert((isa<VPScalarIVStepsRecipe>(OldR) ||
(isa<VPInstruction>(OldR) &&
@@ -768,12 +780,22 @@ static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
// Process the original blocks for lane 0: converting their recipes to
// single-scalar.
- convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, FirstLaneEntry);
+ convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, FirstLaneEntry, VF);
+
+ // For scalar VF, just wire the blocks and return; no cloning or packing
+ // needed.
+ if (VF.isScalar()) {
+ VPBlockUtils::connectBlocks(Predecessor, FirstLaneEntry);
+ VPBlockUtils::connectBlocks(FirstLaneExiting, Successor);
+ return;
+ }
// Clone converted blocks for remaining lanes and process each in reverse
// order, connecting each lane's Exiting block to the subsequent lane's entry.
VPBlockBase *NextLaneEntry = Successor;
unsigned NumLanes = VF.getFixedValue();
+ SmallVector<VPBasicBlock *> ExitingBlocks(NumLanes);
+ ExitingBlocks[0] = cast<VPBasicBlock>(FirstLaneExiting);
for (int Lane = NumLanes - 1; Lane > 0; --Lane) {
const auto &[CurrentLaneEntry, CurrentLaneExiting] =
VPBlockUtils::cloneFrom(FirstLaneEntry);
@@ -782,6 +804,7 @@ static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
processLaneForReplicateRegion(Plan, IdxTy, Lane,
cast<VPBasicBlock>(FirstLaneEntry),
cast<VPBasicBlock>(CurrentLaneEntry));
+ ExitingBlocks[Lane] = cast<VPBasicBlock>(CurrentLaneExiting);
VPBlockUtils::connectBlocks(CurrentLaneExiting, NextLaneEntry);
NextLaneEntry = CurrentLaneEntry;
}
@@ -792,6 +815,68 @@ static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
// FirstLaneExiting.
VPBlockUtils::connectBlocks(Predecessor, FirstLaneEntry);
VPBlockUtils::connectBlocks(FirstLaneExiting, NextLaneEntry);
+
+ // Collect per-lane predicated scalar phis and create InsertElement chains or
+ // BuildStructVector to pack them into a vector result. Phis are in the same
+ // order across all cloned exit blocks, so we can match them by position.
+ SmallVector<SmallVector<VPValue *, 4>> PhisByLane;
+ for (VPBasicBlock *Exit : ExitingBlocks) {
+ auto &Phis = PhisByLane.emplace_back();
+ for (auto &Phi : Exit->phis())
+ Phis.push_back(Phi.getVPSingleValue());
+ }
+
+ auto *LastExit = ExitingBlocks.back();
+ VPBuilder Builder(LastExit, LastExit->end());
+ VPTypeAnalysis TypeInfo(Plan);
+ SmallVector<VPValue *> LaneValues;
+ for (unsigned I = 0; I < PhisByLane[0].size(); ++I) {
+ LaneValues.clear();
+ for (auto &LanePhis : PhisByLane)
+ LaneValues.push_back(LanePhis[I]);
+
+ // If only the first lane is used, no need to pack into a vector.
+ // The lane-0 scalar phi can be used directly.
+ if (vputils::onlyFirstLaneUsed(LaneValues[0]))
+ continue;
+
+ Type *ScalarTy = TypeInfo.inferScalarType(LaneValues[0]);
+
+ // Struct types need BuildStructVector as InsertElement doesn't apply.
+ if (isa<StructType>(ScalarTy)) {
+ auto *BV =
+ Builder.createNaryOp(VPInstruction::BuildStructVector, LaneValues);
+ LaneValues[0]->replaceUsesWithIf(
+ BV, [BV](VPUser &U, unsigned) { return &U != BV; });
+ continue;
+ }
+
+ // Convert chain of scalar phis to InsertElement + VPWidenPHIRecipe chains.
+ VPValue *RunningVec = Plan.getOrAddLiveIn(PoisonValue::get(ScalarTy));
+ for (auto [Lane, LaneVal] : enumerate(LaneValues)) {
+ auto *LanePhi = cast<VPPhi>(LaneVal);
+ assert(LanePhi->getNumOperands() == 2 &&
+ match(LanePhi->getOperand(0), m_Poison()) &&
+ "expected predicated phi");
+ auto *MergeBB = LanePhi->getParent();
+ VPValue *PredVal = LanePhi->getOperand(1);
+ auto *ThenBB = PredVal->getDefiningRecipe()->getParent();
+
+ VPBuilder ThenBuilder(
+ ThenBB, std::next(PredVal->getDefiningRecipe()->getIterator()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane);
+ auto *Insert = ThenBuilder.createNaryOp(Instruction::InsertElement,
+ ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189022
More information about the llvm-commits
mailing list