[llvm] 505ad03 - [LV] Remove redundant IV casts using VPlan (NFCI).
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 10 05:57:31 PST 2021
Author: Florian Hahn
Date: 2021-12-10T13:57:03Z
New Revision: 505ad03c7d29c0937d46dc24b13b2e9ea2a6476a
URL: https://github.com/llvm/llvm-project/commit/505ad03c7d29c0937d46dc24b13b2e9ea2a6476a
DIFF: https://github.com/llvm/llvm-project/commit/505ad03c7d29c0937d46dc24b13b2e9ea2a6476a.diff
LOG: [LV] Remove redundant IV casts using VPlan (NFCI).
This patch simplifies handling of redundant induction casts, by
removing dead cast instructions after initial VPlan construction.
This has the following benefits:
1. Fixes a crash
(see @test_optimized_cast_induction_feeding_first_order_recurrence)
2. Simplifies VPWidenIntOrFpInduction to a single-def recipe
3. Retires recordVectorLoopValueForInductionCast.
Reviewed By: Ayal
Differential Revision: https://reviews.llvm.org/D115112
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/Transforms/LoopVectorize/induction.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 410bf9881285d..a96c844652dba 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -510,7 +510,7 @@ class InnerLoopVectorizer {
/// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
Value *Start, TruncInst *Trunc, VPValue *Def,
- VPValue *CastDef, VPTransformState &State);
+ VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -621,7 +621,7 @@ class InnerLoopVectorizer {
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
const InductionDescriptor &ID, VPValue *Def,
- VPValue *CastDef, VPTransformState &State);
+ VPTransformState &State);
/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
@@ -631,7 +631,6 @@ class InnerLoopVectorizer {
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def,
- VPValue *CastDef,
VPTransformState &State);
/// Returns true if an instruction \p I should be scalarized instead of
@@ -641,29 +640,6 @@ class InnerLoopVectorizer {
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// If there is a cast involved in the induction variable \p ID, which should
- /// be ignored in the vectorized loop body, this function records the
- /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
- /// cast. We had already proved that the casted Phi is equal to the uncasted
- /// Phi in the vectorized loop (under a runtime guard), and therefore
- /// there is no need to vectorize the cast - the same value can be used in the
- /// vector loop for both the Phi and the cast.
- /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
- /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- ///
- /// \p EntryVal is the value from the original loop that maps to the vector
- /// phi node and is used to distinguish what is the IV currently being
- /// processed - original one (if \p EntryVal is a phi corresponding to the
- /// original IV) or the "newly-created" one based on the proof mentioned above
- /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
- /// latter case \p EntryVal is a TruncInst and we must not record anything for
- /// that IV, but it's error-prone to expect callers of this routine to care
- /// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane = UINT_MAX);
-
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
@@ -2358,8 +2334,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
- Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
- VPTransformState &State) {
+ Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
"Expected either an induction phi-node or a truncate of it!");
@@ -2422,8 +2397,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
- State, Part);
LastInduction = cast<Instruction>(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
@@ -2457,42 +2430,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
- unsigned Part, unsigned Lane) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // This induction variable is not the phi from the original loop but the
- // newly-created IV based on the proof that casted Phi is equal to the
- // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
- // re-uses the same InductionDescriptor that original IV uses but we don't
- // have to do any recording in this case - that is done when original IV is
- // processed.
- if (isa<TruncInst>(EntryVal))
- return;
-
- if (!CastDef) {
- assert(ID.getCastInsts().empty() &&
- "there are casts for ID, but no CastDef");
- return;
- }
- assert(!ID.getCastInsts().empty() &&
- "there is a CastDef, but no casts for ID");
- // Only the first Cast instruction in the Casts vector is of interest.
- // The rest of the Casts (if exist) have no uses outside the
- // induction update chain itself.
- if (Lane < UINT_MAX)
- State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
- else
- State.set(CastDef, VectorLoopVal, Part);
-}
-
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
const InductionDescriptor &ID,
Value *Start, TruncInst *Trunc,
- VPValue *Def, VPValue *CastDef,
+ VPValue *Def,
VPTransformState &State) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
@@ -2558,8 +2499,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
- State, Part);
}
};
@@ -2581,8 +2520,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// least one user in the loop that is not widened.
auto NeedsScalarIV = needsScalarInduction(EntryVal);
if (!NeedsScalarIV) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
return;
}
@@ -2590,14 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (!shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
- State);
+ createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
Value *ScalarIV = CreateScalarIV(Step);
// Create scalar steps that can be used by instructions we will later
// scalarize. Note that the addition of the scalar steps will not increase
// the number of instructions in the loop in the common case prior to
// InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
return;
}
@@ -2607,7 +2544,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
Value *ScalarIV = CreateScalarIV(Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
@@ -2661,7 +2598,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
- VPValue *Def, VPValue *CastDef,
+ VPValue *Def,
VPTransformState &State) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");
@@ -2711,8 +2648,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
State.set(Def, Add, Part);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part);
// It's useful to record the lane values too for the known minimum number
// of elements so we do those below. This improves the code quality when
// trying to extract the first element, for example.
@@ -2732,8 +2667,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
State.set(Def, Add, VPIteration(Part, Lane));
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
- Part, Lane);
}
}
}
@@ -8063,18 +7996,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
-
- // We record as "Dead" also the type-casting instructions we had identified
- // during induction analysis. We don't need any handling for them in the
- // vectorized loop because we have proven that, under a proper runtime
- // test guarding the vectorized loop, the value of the phi, and the casted
- // value of the phi, are the same. The last instruction in this casting chain
- // will get its scalar/vector/widened def from the scalar/vector/widened def
- // of the respective phi node. Any other casts in the induction def-use chain
- // have no other uses outside the phi update chain, and will be ignored.
- const InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- DeadInstructions.insert(Casts.begin(), Casts.end());
}
}
@@ -8593,9 +8514,7 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
assert(II->getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
- const SmallVectorImpl<Instruction *> &Casts = II->getCastInsts();
- return new VPWidenIntOrFpInductionRecipe(
- Phi, Operands[0], *II, Casts.empty() ? nullptr : Casts.front());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
}
return nullptr;
@@ -9235,6 +9154,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
+ VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
// Now that sink-after is done, move induction recipes for optimized truncates
// to the phi section of the header block.
for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
@@ -9715,9 +9636,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(
- IV, getInductionDescriptor(), getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0), getCastValue(), State);
+ State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
+ getStartValue()->getLiveInIRValue(),
+ getTruncInst(), getVPValue(0), State);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4d72ae83f5d09..16dfbfbe863d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1004,29 +1004,21 @@ class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
const InductionDescriptor &IndDesc;
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
- const InductionDescriptor &IndDesc,
- Instruction *Cast = nullptr)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
- IndDesc(IndDesc) {
- new VPValue(IV, this);
-
- if (Cast)
- new VPValue(Cast, this);
- }
+ const InductionDescriptor &IndDesc)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
+ IV(IV), IndDesc(IndDesc) {}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
TruncInst *Trunc)
- : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV),
- IndDesc(IndDesc) {
- new VPValue(Trunc, this);
- }
+ : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
+ IV(IV), IndDesc(IndDesc) {}
~VPWidenIntOrFpInductionRecipe() override = default;
@@ -1048,13 +1040,6 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
- /// Returns the cast VPValue, if one is attached, or nullptr otherwise.
- VPValue *getCastValue() {
- if (getNumDefinedValues() != 2)
- return nullptr;
- return getVPValue(1);
- }
-
/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
TruncInst *getTruncInst() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5f1b23300581c..1e6c014392d1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -294,3 +294,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
delete ToDelete;
return Changed;
}
+
+void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove;
+ for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) {
+ auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!IV || IV->getTruncInst())
+ continue;
+
+ // Visit all casts connected to IV and listed in Casts; collect them and
+ // remember them for removal.
+ auto &Casts = IV->getInductionDescriptor().getCastInsts();
+ VPValue *FindMyCast = IV;
+ for (Instruction *IRCast : reverse(Casts)) {
+ VPRecipeBase *FoundUserCast = nullptr;
+ for (auto *U : FindMyCast->users()) {
+ auto *UserCast = cast<VPRecipeBase>(U);
+ if (UserCast->getNumDefinedValues() == 1 &&
+ UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+ FoundUserCast = UserCast;
+ break;
+ }
+ }
+ assert(FoundUserCast && "Missing a cast to remove");
+ CastsToRemove.emplace_back(FoundUserCast, IV);
+ FindMyCast = FoundUserCast->getVPSingleValue();
+ }
+ }
+ for (auto &E : CastsToRemove) {
+ E.first->getVPSingleValue()->replaceAllUsesWith(E.second);
+ E.first->eraseFromParent();
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index bac0ba946d613..a82a562d5e353 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -37,6 +37,14 @@ struct VPlanTransforms {
static bool sinkScalarOperands(VPlan &Plan);
static bool mergeReplicateRegions(VPlan &Plan);
+
+ /// Remove redundant casts of inductions.
+ ///
+ /// Such redundant casts are casts of induction variables that can be ignored,
+ /// because we already proved that the casted phi is equal to the uncasted phi
+ /// in the vectorized loop. There is no need to vectorize the cast - the same
+ /// value can be used for both the phi and casts in the vector loop.
+ static void removeRedundantInductionCasts(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index cdb1dc04a604f..75326216c6d75 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -933,3 +933,41 @@ loop:
exit:
ret void
}
+
+; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV
+; predicates generated for the vector loop. They should be removed in the
+; vector loop.
+define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) {
+; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT: [[MAIN_IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[MAIN_IV_NEXT:%.+]], %vector.body ]
+; CHECK-NEXT: [[VEC_RECUR:%.+]] = phi <2 x i32> [ <i32 poison, i32 0>, %vector.ph ], [ [[VEC_IV:%.+]], %vector.body ]
+; CHECK-NEXT: [[VEC_IV]] = phi <2 x i32> [ %induction, %vector.ph ], [ [[VEC_IV_NEXT:%.+]], %vector.body ]
+; CHECK-NEXT: [[MAIN_IV_0:%.+]] = add i64 [[MAIN_IV]], 0
+; CHECK-NEXT: [[RECUR_SHUFFLE:%.+]] = shufflevector <2 x i32> [[VEC_RECUR]], <2 x i32> [[VEC_IV]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* %ptr, i64 [[MAIN_IV_0]]
+; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; CHECK-NEXT: [[GEP_CAST:%.+]] = bitcast i32* [[GEP1]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[RECUR_SHUFFLE]], <2 x i32>* [[GEP_CAST]], align 4
+; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i64 [[MAIN_IV]], 2
+; CHECK-NEXT: [[VEC_IV_NEXT]] = add <2 x i32> [[VEC_IV]],
+;
+entry:
+ br label %loop
+
+loop:
+ %for = phi i32 [ 0, %entry ], [ %iv.2.conv, %loop ]
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %iv.2.ext = shl i32 %iv.2, 24
+ %iv.2.conv = ashr exact i32 %iv.2.ext, 24
+ %gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1
+ store i32 %for, i32* %gep, align 4
+ %iv.2.next = add nsw i32 %iv.2.conv, %step
+ %iv.1.next = add nuw nsw i64 %iv.1, 1
+ %exitcond = icmp eq i64 %iv.1.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
More information about the llvm-commits
mailing list