[llvm] [LoopUnroll] Structural cost savings analysis for full loop unrolling (PR #114579)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 20 04:58:36 PST 2024
================
@@ -337,8 +336,239 @@ struct PragmaInfo {
const bool PragmaEnableUnroll;
};
+/// Helper type to estimate per-iteration cost savings coming from fully
+/// unrolling a loop.
+///
+/// The analysis maintains a set of "known instructions" inside the loop (i.e.,
+/// instructions whose result will be statically known after loop unrolling)
+/// that we assume will be entirely removable if the loop is fully unrolled.
+/// These instructions' cost (see computeSavings) can be deducted from the
+/// unrolled cost when comparing against a threshold.
+struct FullUnrollCostSavings {
+ FullUnrollCostSavings(const Loop *L) : L(L) {}
+
+ /// Returns whether \p I is currently in the set of known instructions.
+ inline bool isKnown(const Instruction *I) const {
+ return KnownVals.contains(I);
+ }
+
+ /// If \p V is an instruction, returns whether that instruction is known;
+ /// returns false for any other kind of value.
+ bool isKnown(const Value *V) const {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return isKnown(I);
+ return false;
+ }
+
+ /// Adds \p I to the known set (no-op if already present) and evaluates users
+ /// of each newly known instruction to see if their result is now known too.
+ void addToKnown(const Instruction *I) {
+ if (!KnownVals.insert(I).second)
+ return;
+
+ // Every time we assume knowledge of an additional instruction result, we
+ // potentially need to revisit instructions that were previously seen as
+ // unoptimizable.
+ Evaluated.clear();
+
+ addUsersToExploreSet(I);
+ while (ToEvaluate.size()) {
+ const Instruction *I = ToEvaluate.back(); // Shadows the parameter I.
+ ToEvaluate.pop_back();
+ evalInstruction(I);
+ }
+ }
+
+ /// Returns the summed \p TTI cost of all known instructions. Code-size cost is
+ /// used if the loop header's function has minsize, else size-and-latency.
+ InstructionCost computeSavings(const TargetTransformInfo &TTI) const {
+ TargetTransformInfo::TargetCostKind CostKind =
+ L->getHeader()->getParent()->hasMinSize()
+ ? TargetTransformInfo::TCK_CodeSize
+ : TargetTransformInfo::TCK_SizeAndLatency;
+
+ InstructionCost CostSavings;
+ for (const Value *Val : KnownVals)
+ CostSavings += TTI.getInstructionCost(cast<Instruction>(Val), CostKind);
+ return CostSavings;
+ }
+
+private:
+ /// The set of instructions inside the loop whose results are considered known.
+ SmallPtrSet<const Instruction *, 4> KnownVals;
+ /// Instructions already evaluated during the current addToKnown() call; the
+ /// set is cleared each time a new instruction is added through that method.
+ SmallPtrSet<const Instruction *, 4> Evaluated;
+ /// Stack of instructions to evaluate when adding a new instruction to the
+ /// known set.
+ SmallVector<const Instruction *, 4> ToEvaluate;
+ /// The loop under consideration.
+ const Loop *L;
+
+ /// Pushes every instruction user of \p Val that has not been evaluated yet
+ /// onto the stack of instructions to evaluate.
+ void addUsersToExploreSet(const Value *Val) {
+ for (const User *U : Val->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U))
+ if (!Evaluated.contains(I))
+ ToEvaluate.push_back(I);
+ }
+ }
+
+ /// Evaluates \p I and returns whether its result is "known": it must be a
+ /// binary operator, cast or compare with only constant or known operands.
+ /// This may recurse on operands that are yet unevaluated in-loop instructions.
+ bool evalInstruction(const Instruction *I) {
+ Evaluated.insert(I); // Marked up front so operand recursion skips I.
+ if (isKnown(I))
+ return true;
+ if (!isa<BinaryOperator, CastInst, CmpInst>(I))
+ return false;
+ bool Known = llvm::all_of(I->operand_values(), [&](const Value *Val) {
+ if (isa<Constant>(Val) || isKnown(Val))
+ return true;
+ const Instruction *ValInstr = dyn_cast<Instruction>(Val);
+ if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr))
+ return false;
+ return evalInstruction(ValInstr);
+ });
+ if (Known) {
+ KnownVals.insert(I);
+ addUsersToExploreSet(I);
+ }
+ return Known;
+ }
+};
+
} // end anonymous namespace
+/// Runs a fast analysis on the loop to determine whether it is worth it to
+/// fully unroll it. As opposed to analyzeLoopUnrollCost, this does not attempt
+/// to simulate execution of every loop iteration but instead tries to identify
+/// the set of instructions that will be optimizable away if the loop is fully
+/// unrolled. Returns estimated instruction cost savings per loop iteration if
+/// the loop were to be fully unrolled according to the trip count in UP.Count.
+static InstructionCost analyzeFullUnrollCostSavings(
+ const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+ // Cost savings analysis is all based on unrolling making some values
+ // statically known; if we cannot identify the loop's IV then there is nothing
+ // we can do.
+ PHINode *IV = L->getInductionVariable(SE);
+ if (!IV)
+ return {};
+ FullUnrollCostSavings Savings(L);
+
+ // If we were to unroll the loop, everything that is only dependent on the IV
+ // and constants will get simplified away.
+ Savings.addToKnown(IV);
+
+ // Look for subloops whose trip count would go from runtime-dependent to
+ // runtime-independent if we were to unroll the loop. These subloops are
+ // likely to be fully unrollable in the future and yield further cost savings.
+ unsigned NumUnrollableSubloops = 0;
+ for (const Loop *SubLoop : L->getSubLoops()) {
+ // We must be able to determine the loop's IV, initial/final IV value, and
+ // step.
+ PHINode *SubIV = SubLoop->getInductionVariable(SE);
+ if (!SubIV)
+ continue;
+ std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+ if (!Bounds)
+ continue;
+ Value *StepVal = Bounds->getStepValue();
+ if (!StepVal)
+ continue;
+
+ bool SubBoundsDependsOnIV = false;
+ auto IsValKnown = [&](const Value *Val) -> bool {
+ if (isa<Constant>(Val))
+ return true;
+ if (Savings.isKnown(Val)) {
+ SubBoundsDependsOnIV = true;
+ return true;
+ }
+ return false;
+ };
+
+ // Determine whether the derivation of the subloop's bounds depends
+ // exclusively on constants and the outer loop's IV.
+ if (IsValKnown(&Bounds->getInitialIVValue()) &&
+ IsValKnown(&Bounds->getFinalIVValue()) && IsValKnown(StepVal) &&
+ SubBoundsDependsOnIV) {
+ // Optimistically assume that we will be able to unroll the subloop in the
+ // future, which means that its IV will also be known on all inner loop
+ // iterations, leading to more instructions being optimized away. Properly
+ // estimating the cost savings per outer loop iteration would require us
+ // to estimate the average subloop trip count, but it is too complicated
+ // for this analysis. When determining cost savings, we will very
+ // conservatively assume that the inner loop will only execute once per
+ // outer loop iteration. This also reduces our cost savings estimation
+ // mistake in the case where the subloop does not end up being unrolled.
+ Savings.addToKnown(SubIV);
+ ++NumUnrollableSubloops;
+
+ LLVM_DEBUG(
+ dbgs() << " Trip count of subloop %"
+ << SubLoop->getHeader()->getName()
+ << " will become runtime-independent by fully unrolling loop %"
+ << L->getHeader()->getName() << "\n");
+ }
+ }
+
+ // Look for conditional branches whose condition would be statically
+ // determined at each iteration of the loop if it were unrolled. In some
+ // cases, this means we will be able to remove the branch entirely.
+ for (const BasicBlock *BB : L->getBlocks()) {
+ const Instruction *TermInstr = BB->getTerminator();
+ if (const BranchInst *Br = dyn_cast<BranchInst>(TermInstr)) {
+ if (Br->isConditional() && Savings.isKnown(Br->getCondition())) {
+ // The branch condition will be statically determined at each iteration
+ // of the loop.
+ BasicBlock *FalseSucc = Br->getSuccessor(0),
+ *TrueSucc = Br->getSuccessor(1);
+
+ // Checks whether one of the branch successors has at most two
+ // predecessors which are either the branch's block or the other branch
+ // successor.
+ auto IsIfThen = [&](auto Predecessors, BasicBlock *OtherSucc) -> bool {
+ unsigned NumPreds = 0;
+ for (const BasicBlock *Pred : Predecessors) {
+ if (Pred != BB && Pred != OtherSucc)
+ return false;
+ if (++NumPreds > 2)
+ return false;
+ }
+ return true;
+ };
+
+ if ((TrueSucc->getSinglePredecessor() ||
+ IsIfThen(predecessors(TrueSucc), FalseSucc)) &&
+ (FalseSucc->getSinglePredecessor() ||
+ IsIfThen(predecessors(FalseSucc), TrueSucc))) {
+ // The CFG corresponds to a simple if/then(/else) construct whose
+ // condition we will know, so we will be able to remove the branch and
+ // one of the two blocks at each iteration of the outer loop. Only the
+ // branch represents a cost saving, since one successor block will
+ // still be executed.
+ Savings.addToKnown(Br);
+ LLVM_DEBUG(dbgs() << " Conditional branch will be removed by fully "
+ "unrolling loop %"
+ << L->getHeader()->getName() << "\n");
----------------
arsenm wrote:
```suggestion
"unrolling loop "
<< printAsOperand(L->getHeader(), false) << '\n');
```
https://github.com/llvm/llvm-project/pull/114579
More information about the llvm-commits
mailing list