[llvm] [LoopUnroll] Structural cost savings analysis for full loop unrolling (PR #114579)

Fri Dec 20 04:58:36 PST 2024

================
@@ -337,8 +336,239 @@ struct PragmaInfo {
   const bool PragmaEnableUnroll;
 };
 
+/// Helper type to estimate per-iteration cost savings coming from fully
+/// unrolling a loop.
+///
+/// The analysis maintains a set of "known instructions" inside the loop (i.e.,
+/// instructions whose result will be statically known after loop unrolling)
+/// that we assume will be entirely removable if the loop is fully unrolled.
+/// These instructions' cost can be deducted from the unrolled cost when
+/// comparing against a threshold.
+///
+/// Instructions become known in one of two ways: either the client explicitly
+/// seeds them through addToKnown(), or the analysis derives them because they
+/// are binary operators, casts, or comparisons located inside the loop whose
+/// operands are all constants or known instructions.
+struct FullUnrollCostSavings {
+  /// Creates an analysis with an empty known set for loop \p L.
+  explicit FullUnrollCostSavings(const Loop *L) : L(L) {}
+
+  /// Returns whether the instruction is known.
+  bool isKnown(const Instruction *I) const { return KnownVals.contains(I); }
+
+  /// If the value is an instruction, returns whether that instruction is known,
+  /// false otherwise.
+  bool isKnown(const Value *V) const {
+    if (const auto *I = dyn_cast<Instruction>(V))
+      return isKnown(I);
+    return false;
+  }
+
+  /// Adds an instruction to the known set and re-evaluates unknown instructions
+  /// in the loop to determine whether their result can now be known. Does
+  /// nothing if \p I is already known.
+  void addToKnown(const Instruction *I) {
+    if (!KnownVals.insert(I).second)
+      return;
+
+    // Every time we assume knowledge of an additional instruction result, we
+    // potentially need to revisit instructions that were previously seen as
+    // unoptimizable.
+    Evaluated.clear();
+
+    // Propagate the new knowledge through the use-def graph. Evaluating an
+    // instruction may push more work on the stack, so iterate until it drains.
+    addUsersToExploreSet(I);
+    while (!ToEvaluate.empty())
+      evalInstruction(ToEvaluate.pop_back_val());
+  }
+
+  /// Returns savings incurred by all known instructions, according to the \p
+  /// TTI. The cost kind is code size when the enclosing function is marked
+  /// minsize, and size-and-latency otherwise.
+  InstructionCost computeSavings(const TargetTransformInfo &TTI) const {
+    const TargetTransformInfo::TargetCostKind CostKind =
+        L->getHeader()->getParent()->hasMinSize()
+            ? TargetTransformInfo::TCK_CodeSize
+            : TargetTransformInfo::TCK_SizeAndLatency;
+
+    InstructionCost CostSavings;
+    for (const Instruction *I : KnownVals)
+      CostSavings += TTI.getInstructionCost(I, CostKind);
+    return CostSavings;
+  }
+
+private:
+  /// The set of instructions inside the loop whose results are considered
+  /// known.
+  SmallPtrSet<const Instruction *, 4> KnownVals;
+  /// Caches the set of instructions we have already evaluated when adding a new
+  /// instruction to the known set.
+  SmallPtrSet<const Instruction *, 4> Evaluated;
+  /// Stack of instructions to evaluate when adding a new instruction to the
+  /// known set.
+  SmallVector<const Instruction *, 4> ToEvaluate;
+  /// The loop under consideration.
+  const Loop *L;
+
+  /// Adds all value users located inside the loop to the stack of instructions
+  /// to evaluate, if they have not been evaluated already.
+  void addUsersToExploreSet(const Value *Val) {
+    for (const User *U : Val->users()) {
+      const auto *UserInstr = dyn_cast<Instruction>(U);
+      if (!UserInstr || Evaluated.contains(UserInstr))
+        continue;
+      // Users outside of the loop are not replicated by unrolling, so they can
+      // never contribute per-iteration savings; keep them out of the known set.
+      if (!L->contains(UserInstr))
+        continue;
+      ToEvaluate.push_back(UserInstr);
+    }
+  }
+
+  /// Evaluates an instruction to determine whether its result is "known", and
+  /// returns if that is the case. This may recurse on operands that are the
+  /// result of yet unevaluated instructions inside the loop.
+  bool evalInstruction(const Instruction *I) {
+    // Mark the instruction before looking at its operands so that it is not
+    // evaluated again while this evaluation is still in progress.
+    Evaluated.insert(I);
+    if (isKnown(I))
+      return true;
+    // Only these instruction kinds are assumed to fold away once all of their
+    // operands are statically known.
+    if (!isa<BinaryOperator, CastInst, CmpInst>(I))
+      return false;
+    bool Known = llvm::all_of(I->operand_values(), [&](const Value *Val) {
+      if (isa<Constant>(Val) || isKnown(Val))
+        return true;
+      // An operand that is not an instruction, was already evaluated without
+      // becoming known, or lives outside of the loop cannot become known.
+      const auto *ValInstr = dyn_cast<Instruction>(Val);
+      if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr))
+        return false;
+      return evalInstruction(ValInstr);
+    });
+    if (Known) {
+      KnownVals.insert(I);
+      addUsersToExploreSet(I);
+    }
+    return Known;
+  }
+};
+
 } // end anonymous namespace
 
+/// Runs a fast analysis on the loop to determine whether it is worth it to
+/// fully unroll it. As opposed to analyzeLoopUnrollCost, this does not attempt
+/// to simulate execution of every loop iteration but instead tries to identify
+/// the set of instructions that will be optimizable away if the loop is fully
+/// unrolled. Returns estimated instruction cost savings per loop iteration if
+/// the loop were to be fully unrolled according to the trip count in UP.Count.
+static InstructionCost analyzeFullUnrollCostSavings(
+    const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+    const TargetTransformInfo::UnrollingPreferences &UP) {
+  // Cost savings analysis is all based on unrolling making some values
+  // statically known; if we cannot identify the loop's IV then there is nothing
+  // we can do.
+  PHINode *IV = L->getInductionVariable(SE);
+  if (!IV)
+    return {};
+  FullUnrollCostSavings Savings(L);
+
+  // If we were to unroll the loop, everything that is only dependent on the IV
+  // and constants will get simplified away.
+  Savings.addToKnown(IV);
+
+  // Look for subloops whose trip count would go from runtime-dependent to
+  // runtime-independent if we were to unroll the loop. These subloops are
+  // likely to be fully unrollable in the future and yield further cost savings.
+  unsigned NumUnrollableSubloops = 0;
+  for (const Loop *SubLoop : L->getSubLoops()) {
+    // We must be able to determine the loop's IV, initial/final IV value, and
+    // step.
+    PHINode *SubIV = SubLoop->getInductionVariable(SE);
+    if (!SubIV)
+      continue;
+    std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+    if (!Bounds)
+      continue;
+    Value *StepVal = Bounds->getStepValue();
+    if (!StepVal)
+      continue;
+
+    bool SubBoundsDependsOnIV = false;
+    auto IsValKnown = [&](const Value *Val) -> bool {
+      if (isa<Constant>(Val))
+        return true;
+      if (Savings.isKnown(Val)) {
+        SubBoundsDependsOnIV = true;
+        return true;
+      }
+      return false;
+    };
+
+    // Determine whether the derivation of the subloop's bounds depends
+    // exclusively on constants and the outer loop's IV.
+    if (IsValKnown(&Bounds->getInitialIVValue()) &&
+        IsValKnown(&Bounds->getFinalIVValue()) && IsValKnown(StepVal) &&
+        SubBoundsDependsOnIV) {
+      // Optimistically assume that we will be able to unroll the subloop in the
+      // future, which means that its IV will also be known on all inner loop
+      // iterations, leading to more instructions being optimized away. Properly
+      // estimating the cost savings per outer loop iteration would require us
+      // to estimate the average subloop trip count, but it is too complicated
+      // for this analysis. When determining cost savings, we will very
+      // conservatively assume that the inner loop will only execute once per
+      // outer loop iteration. This also reduces our cost savings estimation
+      // mistake in the case where the subloop does not end up being unrolled.
+      Savings.addToKnown(SubIV);
+      ++NumUnrollableSubloops;
+
+      LLVM_DEBUG(
+          dbgs() << "  Trip count of subloop %"
+                 << SubLoop->getHeader()->getName()
+                 << " will become runtime-independent by fully unrolling loop %"
+                 << L->getHeader()->getName() << "\n");
+    }
+  }
+
+  // Look for conditional branches whose condition would be statically
+  // determined at each iteration of the loop if it were unrolled. In some
+  // cases, this means we will be able to remove the branch entirely.
+  for (const BasicBlock *BB : L->getBlocks()) {
+    const Instruction *TermInstr = BB->getTerminator();
+    if (const BranchInst *Br = dyn_cast<BranchInst>(TermInstr)) {
+      if (Br->isConditional() && Savings.isKnown(Br->getCondition())) {
+        // The branch condition will be statically determined at each iteration
+        // of the loop.
+        BasicBlock *FalseSucc = Br->getSuccessor(0),
+                   *TrueSucc = Br->getSuccessor(1);
+
+        // Checks whether one of the branch successor has at most two
+        // predecessors which are either the branch's block or the other branch
+        // successor.
+        auto IsIfThen = [&](auto Predecessors, BasicBlock *OtherSucc) -> bool {
+          unsigned NumPreds = 0;
+          for (const BasicBlock *Pred : Predecessors) {
+            if (Pred != BB && Pred != OtherSucc)
+              return false;
+            if (++NumPreds > 2)
+              return false;
+          }
+          return true;
+        };
+
+        if ((TrueSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(TrueSucc), FalseSucc)) &&
+            (FalseSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(FalseSucc), TrueSucc))) {
+          // The CFG corresponds to a simple if/then(/else) construct whose
+          // condition we will know, so we will be able to remove the branch
+          // and one of the two blocks at each iteration of the outer loop.
+          // Only the branch represents a cost saving, since one successor
+          // block will still be executed.
+          Savings.addToKnown(Br);
+          LLVM_DEBUG(dbgs() << "  Conditional branch will be removed by fully "
+                               "unrolling loop %"
+                            << L->getHeader()->getName() << "\n");
----------------
arsenm wrote:

```suggestion
                               "unrolling loop "
                            << printAsOperand(L->getHeader(), false) << '\n');
```

https://github.com/llvm/llvm-project/pull/114579