[llvm] d7ebb74 - [SelectOpti][4/5] Loop Heuristics
Sotiris Apostolakis via llvm-commits
llvm-commits at lists.llvm.org
Mon May 23 20:03:24 PDT 2022
Author: Sotiris Apostolakis
Date: 2022-05-23T22:05:41-04:00
New Revision: d7ebb7461151fdb8dcf00a9a01d484355a0c6074
URL: https://github.com/llvm/llvm-project/commit/d7ebb7461151fdb8dcf00a9a01d484355a0c6074
DIFF: https://github.com/llvm/llvm-project/commit/d7ebb7461151fdb8dcf00a9a01d484355a0c6074.diff
LOG: [SelectOpti][4/5] Loop Heuristics
This patch adds the loop-level heuristics for determining whether branches are more profitable than conditional moves.
These heuristics apply only to inner-most loops; a simplified sketch of the decision flow is given below.
Depends on D120231
Reviewed By: davidxl
Differential Revision: https://reviews.llvm.org/D120232
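
At a high level, the pass now estimates the loop's critical-path latency over two consecutive iterations, once with the selects kept as conditional moves (predicated) and once with them lowered to branches (non-predicated), and converts only when the branch form shortens that path by both an absolute and a relative margin. The following is a simplified, self-contained sketch of that gating logic, not code from the patch: it uses plain doubles and hard-codes the default thresholds, whereas the committed implementation (SelectOptimize::checkLoopHeuristics below) works on ScaledNumber<uint64_t> costs produced by computeLoopCosts.

// Illustrative sketch only; not part of this commit.
struct LoopPathCost {
  double Pred;    // critical path with selects as conditional moves
  double NonPred; // critical path with selects lowered to branches
};

// Costs[0] and Costs[1] are the critical-path costs after one and two
// analyzed loop iterations, respectively.
bool preferBranchesForInnerLoop(const LoopPathCost Costs[2]) {
  // Branches must not lengthen the critical path of either iteration.
  if (Costs[0].NonPred > Costs[0].Pred || Costs[1].NonPred >= Costs[1].Pred)
    return false;

  double Gain0 = Costs[0].Pred - Costs[0].NonPred;
  double Gain1 = Costs[1].Pred - Costs[1].NonPred;

  // Require an absolute gain of at least GainCycleThreshold (default 4
  // cycles) and a relative gain of at least 1/GainRelativeThreshold
  // (default 1/8 of the predicated critical path).
  if (Gain1 < 4.0 || Gain1 * 8.0 < Costs[1].Pred)
    return false;

  // A loop-carried critical path makes the gain grow with each iteration;
  // require that growth rate to be at least GainGradientThreshold% (25%).
  if (Gain1 > Gain0)
    return 100.0 * (Gain1 - Gain0) >= 25.0 * (Costs[1].Pred - Costs[0].Pred);

  // A constant gain is acceptable; a shrinking gain is not profitable.
  return Gain1 >= Gain0;
}

Per-select-group profitability (branch cost vs. conditional-move cost, assuming infinite ILP) is then checked on top of this loop-level gate.
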
Added:
Modified:
llvm/lib/CodeGen/SelectOptimize.cpp
llvm/test/CodeGen/X86/select-optimize.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 5cf937335df9..2194da77e0b4 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -30,6 +31,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/ScaledNumber.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
@@ -52,6 +54,8 @@ STATISTIC(NumSelectUnPred,
"Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
"Number of select groups not converted due to cold basic block");
+STATISTIC(NumSelectConvertedLoop,
+ "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");
static cl::opt<unsigned> ColdOperandThreshold(
@@ -65,6 +69,31 @@ static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
"slice of a cold operand to be considered inexpensive."),
cl::init(1), cl::Hidden);
+static cl::opt<unsigned>
+ GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
+ cl::desc("Gradient gain threshold (%)."),
+ cl::init(25), cl::Hidden);
+
+static cl::opt<unsigned>
+ GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
+ cl::desc("Minimum gain per loop (in cycles) threshold."),
+ cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> GainRelativeThreshold(
+ "select-opti-loop-relative-gain-threshold",
+ cl::desc(
+ "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
+ cl::init(8), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+ "mispredict-default-rate", cl::Hidden, cl::init(25),
+ cl::desc("Default mispredict rate (initialized to 25%)."));
+
+static cl::opt<bool>
+ DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable loop-level heuristics."));
+
namespace {
class SelectOptimize : public FunctionPass {
@@ -78,6 +107,7 @@ class SelectOptimize : public FunctionPass {
std::unique_ptr<BranchProbabilityInfo> BPI;
ProfileSummaryInfo *PSI;
OptimizationRemarkEmitter *ORE;
+ TargetSchedModel TSchedModel;
public:
static char ID;
@@ -103,6 +133,15 @@ class SelectOptimize : public FunctionPass {
using SelectGroup = SmallVector<SelectInst *, 2>;
using SelectGroups = SmallVector<SelectGroup, 2>;
+ using Scaled64 = ScaledNumber<uint64_t>;
+
+ struct CostInfo {
+ /// Predicated cost (with selects as conditional moves).
+ Scaled64 PredCost;
+ /// Non-predicated cost (with selects converted to branches).
+ Scaled64 NonPredCost;
+ };
+
// Converts select instructions of a function to conditional jumps when deemed
// profitable. Returns true if at least one select was converted.
bool optimizeSelects(Function &F);
@@ -122,9 +161,12 @@ class SelectOptimize : public FunctionPass {
void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
// Determines for which select groups it is profitable converting to branches
- // (base heuristics).
+ // (base and inner-most-loop heuristics).
void findProfitableSIGroupsBase(SelectGroups &SIGroups,
SelectGroups &ProfSIGroups);
+ void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups);
+
// Determines if a select group should be converted to a branch (base
// heuristics).
bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
@@ -143,6 +185,29 @@ class SelectOptimize : public FunctionPass {
// Returns true if the condition of the select is highly predictable.
bool isSelectHighlyPredictable(const SelectInst *SI);
+ // Loop-level checks to determine if a non-predicated version (with branches)
+ // of the given loop is more profitable than its predicated version.
+ bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
+
+ // Computes instruction and loop-critical-path costs for both the predicated
+ // and non-predicated version of the given loop.
+ bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+ DenseMap<const Instruction *, CostInfo> &InstCostMap,
+ CostInfo *LoopCost);
+
+ // Returns a set of all the select instructions in the given select groups.
+ SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+
+ // Returns the latency cost of a given instruction.
+ Optional<uint64_t> computeInstCost(const Instruction *I);
+
+ // Returns the misprediction cost of a given select when converted to branch.
+ Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
+
+ // Returns the cost of a branch when the prediction is correct.
+ Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+ const SelectInst *SI);
+
// Returns true if the target architecture supports lowering a given select.
bool isSelectKindSupported(SelectInst *SI);
};
@@ -183,6 +248,7 @@ bool SelectOptimize::runOnFunction(Function &F) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ TSchedModel.init(TSI);
// When optimizing for size, selects are preferable over branches.
if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
@@ -224,7 +290,24 @@ void SelectOptimize::optimizeSelectsBase(Function &F,
}
void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
- SelectGroups &ProfSIGroups) {}
+ SelectGroups &ProfSIGroups) {
+ SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+ // Need to check size on each iteration as we accumulate child loops.
+ for (unsigned long i = 0; i < Loops.size(); ++i)
+ for (Loop *ChildL : Loops[i]->getSubLoops())
+ Loops.push_back(ChildL);
+
+ for (Loop *L : Loops) {
+ if (!L->isInnermost())
+ continue;
+
+ SelectGroups SIGroups;
+ for (BasicBlock *BB : L->getBlocks())
+ collectSelectGroups(*BB, SIGroups);
+
+ findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+ }
+}
/// If \p isTrue is true, return the true value of \p SI, otherwise return
/// false value of \p SI. If the true/false value of \p SI is defined by any
@@ -376,6 +459,53 @@ void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
}
}
+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+ const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+ NumSelectOptAnalyzed += SIGroups.size();
+ // For each select group in an inner-most loop,
+ // a branch is more preferable than a select/conditional-move if:
+ // i) conversion to branches for all the select groups of the loop satisfies
+ // loop-level heuristics including reducing the loop's critical path by
+ // some threshold (see SelectOptimize::checkLoopHeuristics); and
+ // ii) the total cost of the select group is cheaper with a branch compared
+ // to its predicated version. The cost is in terms of latency and the cost
+ // of a select group is the cost of its most expensive select instruction
+ // (assuming infinite resources and thus fully leveraging available ILP).
+
+ DenseMap<const Instruction *, CostInfo> InstCostMap;
+ CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
+ {Scaled64::getZero(), Scaled64::getZero()}};
+ if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+ !checkLoopHeuristics(L, LoopCost)) {
+ return;
+ }
+
+ for (SelectGroup &ASI : SIGroups) {
+ // Assuming infinite resources, the cost of a group of instructions is the
+ // cost of the most expensive instruction of the group.
+ Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
+ for (SelectInst *SI : ASI) {
+ SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+ BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+ }
+ if (BranchCost < SelectCost) {
+ OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+ OR << "Profitable to convert to branch (loop analysis). BranchCost="
+ << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
+ << ". ";
+ ORE->emit(OR);
+ ++NumSelectConvertedLoop;
+ ProfSIGroups.push_back(ASI);
+ } else {
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+ ORmiss << "Select is more profitable (loop analysis). BranchCost="
+ << BranchCost.toString()
+ << ", SelectCost=" << SelectCost.toString() << ". ";
+ ORE->emit(ORmiss);
+ }
+ }
+}
+
bool SelectOptimize::isConvertToBranchProfitableBase(
const SmallVector<SelectInst *, 2> &ASI) {
SelectInst *SI = ASI.front();
@@ -529,6 +659,220 @@ bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
return false;
}
+bool SelectOptimize::checkLoopHeuristics(const Loop *L,
+ const CostInfo LoopCost[2]) {
+ // Loop-level checks to determine if a non-predicated version (with branches)
+ // of the loop is more profitable than its predicated version.
+
+ if (DisableLoopLevelHeuristics)
+ return true;
+
+ OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+ L->getHeader()->getFirstNonPHI());
+
+ if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
+ LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
+ ORmissL << "No select conversion in the loop due to no reduction of loop's "
+ "critical path. ";
+ ORE->emit(ORmissL);
+ return false;
+ }
+
+ Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+ LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+ // Profitably converting to branches needs to reduce the loop's critical path
+ // by at least some threshold (absolute gain of GainCycleThreshold cycles and
+ // relative gain of 12.5%).
+ if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
+ Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
+ Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
+ ORmissL << "No select conversion in the loop due to small reduction of "
+ "loop's critical path. Gain="
+ << Gain[1].toString()
+ << ", RelativeGain=" << RelativeGain.toString() << "%. ";
+ ORE->emit(ORmissL);
+ return false;
+ }
+
+ // If the loop's critical path involves loop-carried dependences, the gradient
+ // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
+ // This check ensures that the latency reduction for the loop's critical path
+ // keeps decreasing with sufficient rate beyond the two analyzed loop
+ // iterations.
+ if (Gain[1] > Gain[0]) {
+ Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
+ (LoopCost[1].PredCost - LoopCost[0].PredCost);
+ if (GradientGain < Scaled64::get(GainGradientThreshold)) {
+ ORmissL << "No select conversion in the loop due to small gradient gain. "
+ "GradientGain="
+ << GradientGain.toString() << "%. ";
+ ORE->emit(ORmissL);
+ return false;
+ }
+ }
+ // If the gain decreases it is not profitable to convert.
+ else if (Gain[1] < Gain[0]) {
+ ORmissL
+ << "No select conversion in the loop due to negative gradient gain. ";
+ ORE->emit(ORmissL);
+ return false;
+ }
+
+ // Non-predicated version of the loop is more profitable than its
+ // predicated version.
+ return true;
+}
+
+// Computes instruction and loop-critical-path costs for both the predicated
+// and non-predicated version of the given loop.
+// Returns false if unable to compute these costs due to invalid cost of loop
+// instruction(s).
+bool SelectOptimize::computeLoopCosts(
+ const Loop *L, const SelectGroups &SIGroups,
+ DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
+ const auto &SIset = getSIset(SIGroups);
+ // Compute instruction and loop-critical-path costs across two iterations for
+ // both predicated and non-predicated version.
+ const unsigned Iterations = 2;
+ for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
+ // Cost of the loop's critical path.
+ CostInfo &MaxCost = LoopCost[Iter];
+ for (BasicBlock *BB : L->getBlocks()) {
+ for (const Instruction &I : *BB) {
+ if (I.isDebugOrPseudoInst())
+ continue;
+ // Compute the predicated and non-predicated cost of the instruction.
+ Scaled64 IPredCost = Scaled64::getZero(),
+ INonPredCost = Scaled64::getZero();
+
+ // Assume infinite resources that allow to fully exploit the available
+ // instruction-level parallelism.
+ // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
+ for (const Use &U : I.operands()) {
+ auto UI = dyn_cast<Instruction>(U.get());
+ if (!UI)
+ continue;
+ if (InstCostMap.count(UI)) {
+ IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
+ INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
+ }
+ }
+ auto ILatency = computeInstCost(&I);
+ if (!ILatency.hasValue()) {
+ OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
+ ORmissL << "Invalid instruction cost preventing analysis and "
+ "optimization of the inner-most loop containing this "
+ "instruction. ";
+ ORE->emit(ORmissL);
+ return false;
+ }
+ IPredCost += Scaled64::get(ILatency.getValue());
+ INonPredCost += Scaled64::get(ILatency.getValue());
+
+ // For a select that can be converted to branch,
+ // compute its cost as a branch (non-predicated cost).
+ //
+ // BranchCost = PredictedPathCost + MispredictCost
+ // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+ // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+ if (SIset.contains(&I)) {
+ auto SI = dyn_cast<SelectInst>(&I);
+
+ Scaled64 TrueOpCost = Scaled64::getZero(),
+ FalseOpCost = Scaled64::getZero();
+ if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+ if (InstCostMap.count(TI))
+ TrueOpCost = InstCostMap[TI].NonPredCost;
+ if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+ if (InstCostMap.count(FI))
+ FalseOpCost = InstCostMap[FI].NonPredCost;
+ Scaled64 PredictedPathCost =
+ getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+ Scaled64 CondCost = Scaled64::getZero();
+ if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+ if (InstCostMap.count(CI))
+ CondCost = InstCostMap[CI].NonPredCost;
+ Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
+
+ INonPredCost = PredictedPathCost + MispredictCost;
+ }
+
+ InstCostMap[&I] = {IPredCost, INonPredCost};
+ MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+ MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+ }
+ }
+ }
+ return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+ SmallPtrSet<const Instruction *, 2> SIset;
+ for (const SelectGroup &ASI : SIGroups)
+ for (const SelectInst *SI : ASI)
+ SIset.insert(SI);
+ return SIset;
+}
+
+Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+ InstructionCost ICost =
+ TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+ if (auto OC = ICost.getValue())
+ return Optional<uint64_t>(OC.getValue());
+ return Optional<uint64_t>(None);
+}
+
+ScaledNumber<uint64_t>
+SelectOptimize::getMispredictionCost(const SelectInst *SI,
+ const Scaled64 CondCost) {
+ uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+ // Account for the default misprediction rate when using a branch
+ // (conservatively set to 25% by default).
+ uint64_t MispredictRate = MispredictDefaultRate;
+ // If the select condition is obviously predictable, then the misprediction
+ // rate is zero.
+ if (isSelectHighlyPredictable(SI))
+ MispredictRate = 0;
+
+ // CondCost is included to account for cases where the computation of the
+ // condition is part of a long dependence chain (potentially loop-carried)
+ // that would delay detection of a misprediction and increase its cost.
+ Scaled64 MispredictCost =
+ std::max(Scaled64::get(MispredictPenalty), CondCost) *
+ Scaled64::get(MispredictRate);
+ MispredictCost /= Scaled64::get(100);
+
+ return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+ScaledNumber<uint64_t>
+SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+ const SelectInst *SI) {
+ Scaled64 PredPathCost;
+ uint64_t TrueWeight, FalseWeight;
+ if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t SumWeight = TrueWeight + FalseWeight;
+ if (SumWeight != 0) {
+ PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
+ FalseCost * Scaled64::get(FalseWeight);
+ PredPathCost /= Scaled64::get(SumWeight);
+ return PredPathCost;
+ }
+ }
+ // Without branch weight metadata, we assume 75% for the one path and 25% for
+ // the other, and pick the result with the biggest cost.
+ PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
+ FalseCost * Scaled64::get(3) + TrueCost);
+ PredPathCost /= Scaled64::get(4);
+ return PredPathCost;
+}
+
bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
if (VectorCond)
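
To make the per-select cost comparison above concrete, here is a small worked example with invented numbers (they are not taken from any particular scheduling model). Suppose a select whose condition is ready at cycle 6 and whose true and false operands are ready at cycles 8 and 2 in both loop versions, the select (cmov) latency is 2 cycles, the target's MispredictPenalty is 14 cycles, the branch weights are 50/50, and the condition is not highly predictable. Then, following computeLoopCosts, getPredictedPathCost, and getMispredictionCost:

  Predicated cost (cmov):       max(6, 8, 2) + 2        = 10 cycles
  PredictedPathCost:            (8 * 50 + 2 * 50) / 100 = 5 cycles
  MispredictCost:               max(14, 6) * 25 / 100   = 3.5 cycles
  Non-predicated cost (branch): 5 + 3.5                 = 8.5 cycles

Since 8.5 < 10, findProfitableSIGroupsInnerLoops would report this select group as profitable to convert, provided the loop-level checks in checkLoopHeuristics also pass for the enclosing loop.
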
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index 99a194f033bb..ee267c80e7a5 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -211,9 +211,301 @@ define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) {
ret i32 %add
}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test loop heuristic: loop-level critical-path analysis
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Use of cmov in this test would put a load and a fsub on the critical path.
+;; Loop-level analysis should decide to form a branch.
+;;
+;;double cmov_on_critical_path(int n, double x, double *a) {
+;; for (int i = 0; i < n; i++) {
+;; double r = a[i];
+;; if (x > r)
+;; // 50% of iterations
+;; x -= r;
+;; }
+;; return x;
+;;}
+define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) {
+; CHECK-LABEL: @cmov_on_critical_path(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret double [[X:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[SELECT_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]]
+; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]]
+; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF27:![0-9]+]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret double [[X2]]
+;
+entry:
+ %cmp1 = icmp sgt i32 %n, 0
+ br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %entry
+ ret double %x
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ %r = load double, double* %arrayidx, align 8
+ %sub = fsub double %x1, %r
+ %cmp2 = fcmp ogt double %x1, %r
+ %x2 = select i1 %cmp2, double %sub, double %x1, !prof !18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret double %x2
+}
+
+;; The common path includes expensive operations (load and fsub) making
+;; branch similarly expensive to cmov, and thus the gain is small.
+;; Loop-level analysis should decide on not forming a branch.
+;;
+;;double small_gain(int n, double x, double *a) {
+;; for (int i = 0; i < n; i++) {
+;; double r = a[i];
+;; if (x > r)
+;; // 99% of iterations
+;; x -= r;
+;; }
+;; return x;
+;;}
+define double @small_gain(i32 %n, double %x, double* nocapture %a) {
+; CHECK-LABEL: @small_gain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret double [[X:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[FOR_BODY]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ole double [[X1]], [[R]]
+; CHECK-NEXT: [[X2]] = select i1 [[CMP2]], double [[X1]], double [[SUB]], !prof [[PROF18]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret double [[X2]]
+;
+entry:
+ %cmp1 = icmp sgt i32 %n, 0
+ br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %entry
+ ret double %x
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ %r = load double, double* %arrayidx, align 8
+ %sub = fsub double %x1, %r
+ %cmp2 = fcmp ole double %x1, %r
+ %x2 = select i1 %cmp2, double %x1, double %sub, !prof !17
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret double %x2
+}
+
+;; Use of a branch in this test would avoid executing a load and several
+;; floating-point operations for most cases (70% of the time).
+;; Yet, the gain is not increasing much per iteration (small gradient gain).
+;; Loop-level analysis should decide not to form a branch.
+;;
+;;double small_gradient(int n, double x, double *a) {
+;; for (int i = 0; i < n; i++) {
+;; double r = 2 * a[i] + i;
+;; if (r > 0)
+;; // 30% of iterations
+;; x -= r;
+;; }
+;; return x;
+;;}
+define double @small_gradient(i32 %n, double %x, double* nocapture %a) {
+; CHECK-LABEL: @small_gradient(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[X_ADDR_0_LCSSA:%.*]] = phi double [ [[X:%.*]], [[ENTRY:%.*]] ], [ [[X_ADDR_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: ret double [[X_ADDR_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[X_ADDR_010:%.*]] = phi double [ [[X]], [[FOR_BODY_PREHEADER]] ], [ [[X_ADDR_1]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 2.000000e+00, double 1.000000e+00)
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP1]], 0.000000e+00
+; CHECK-NEXT: [[SUB:%.*]] = select i1 [[CMP1]], double [[TMP1]], double 0.000000e+00, !prof [[PROF28:![0-9]+]]
+; CHECK-NEXT: [[X_ADDR_1]] = fsub double [[X_ADDR_010]], [[SUB]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %x.addr.0.lcssa = phi double [ %x, %entry ], [ %x.addr.1, %for.body ]
+ ret double %x.addr.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %x.addr.010 = phi double [ %x, %for.body.preheader ], [ %x.addr.1, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %1 = call double @llvm.fmuladd.f64(double %0, double 2.000000e+00, double 1.000000e+00)
+ %cmp1 = fcmp ogt double %1, 0.000000e+00
+ %sub = select i1 %cmp1, double %1, double 0.000000e+00, !prof !28
+ %x.addr.1 = fsub double %x.addr.010, %sub
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+;; One select on the critical path and one off the critical path.
+;; Loop-level analysis should decide to form a branch only for
+;; the select on the critical path.
+;;
+;;double loop_select_groups(int n, double x, double *a, int k) {
+;; int c = 0;
+;; for (int i = 0; i < n; i++) {
+;; double r = a[i];
+;; if (x > r)
+;; x -= r;
+;; if (i == k)
+;; c += n;
+;; }
+;; return x + c;
+;;}
+define double @loop_select_groups(i32 %n, double %x, double* nocapture %a, i32 %k) {
+; CHECK-LABEL: @loop_select_groups(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[PHI_CAST:%.*]] = sitofp i32 [[C_1:%.*]] to double
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[PHI_CAST]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[X_ADDR_0_LCSSA:%.*]] = phi double [ [[X:%.*]], [[ENTRY]] ], [ [[X_ADDR_1:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[X_ADDR_0_LCSSA]], [[C_0_LCSSA]]
+; CHECK-NEXT: ret double [[ADD5]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[X_ADDR_022:%.*]] = phi double [ [[X]], [[FOR_BODY_PREHEADER]] ], [ [[X_ADDR_1]], [[SELECT_END]] ]
+; CHECK-NEXT: [[C_020:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[C_1]], [[SELECT_END]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[X_ADDR_022]], [[TMP0]]
+; CHECK-NEXT: [[SUB_FROZEN:%.*]] = freeze i1 [[CMP1]]
+; CHECK-NEXT: br i1 [[SUB_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[SUB:%.*]] = phi double [ [[TMP0]], [[FOR_BODY]] ], [ 0.000000e+00, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[X_ADDR_1]] = fsub double [[X_ADDR_022]], [[SUB]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[K:%.*]], [[N]]
+; CHECK-NEXT: [[ADD:%.*]] = select i1 [[CMP2]], i32 [[N]], i32 0
+; CHECK-NEXT: [[C_1]] = add nsw i32 [[ADD]], [[C_020]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp19 = icmp sgt i32 %n, 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phi.cast = sitofp i32 %c.1 to double
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %c.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %phi.cast, %for.cond.cleanup.loopexit ]
+ %x.addr.0.lcssa = phi double [ %x, %entry ], [ %x.addr.1, %for.cond.cleanup.loopexit ]
+ %add5 = fadd double %x.addr.0.lcssa, %c.0.lcssa
+ ret double %add5
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %x.addr.022 = phi double [ %x, %for.body.preheader ], [ %x.addr.1, %for.body ]
+ %c.020 = phi i32 [ 0, %for.body.preheader ], [ %c.1, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %cmp1 = fcmp ogt double %x.addr.022, %0
+ %sub = select i1 %cmp1, double %0, double 0.000000e+00
+ %x.addr.1 = fsub double %x.addr.022, %sub
+ %1 = trunc i64 %indvars.iv to i32
+ %cmp2 = icmp eq i32 %k, %n
+ %add = select i1 %cmp2, i32 %n, i32 0
+ %c.1 = add nsw i32 %add, %c.020
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
; Function Attrs: nounwind readnone speculatable willreturn
declare void @llvm.dbg.value(metadata, metadata, metadata)
+; Function Attrs: mustprogress nofree nosync nounwind readnone speculatable willreturn
+declare double @llvm.fmuladd.f64(double, double, double)
+
!llvm.module.flags = !{!0, !26, !27}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
@@ -243,3 +535,4 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
!25 = !{}
!26 = !{i32 2, !"Dwarf Version", i32 4}
!27 = !{i32 1, !"Debug Info Version", i32 3}
+!28 = !{!"branch_weights", i32 30, i32 70}