[llvm] 8b42bc5 - [SelectOpti][3/5] Base Heuristics
Sotiris Apostolakis via llvm-commits
llvm-commits at lists.llvm.org
Mon May 23 19:02:27 PDT 2022
Author: Sotiris Apostolakis
Date: 2022-05-23T22:01:12-04:00
New Revision: 8b42bc5662ca13c97746b7301bd503b2662dc444
URL: https://github.com/llvm/llvm-project/commit/8b42bc5662ca13c97746b7301bd503b2662dc444
DIFF: https://github.com/llvm/llvm-project/commit/8b42bc5662ca13c97746b7301bd503b2662dc444.diff
LOG: [SelectOpti][3/5] Base Heuristics
This patch adds the base heuristics for determining whether branches are more profitable than conditional moves.
Base heuristics apply to all code apart from inner-most loops; inner-most loops will be handled by separate heuristics in a later patch of this series.
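For illustration, a minimal before/after sketch of the conversion these heuristics gate (the function name, block names, and metadata numbering are made up; the shape of the converted code mirrors the CHECK lines in the updated select-optimize.ll test below). A heavily biased select is assumed to be highly predictable and, when the target reports predictable selects as expensive, is rewritten into a conditional branch plus a phi:

  ; Before: heavily biased select (assumed weights 100:1).
  define i32 @sketch(i32 %a, i32 %b, i1 %cmp) {
  entry:
    %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
    ret i32 %sel
  }
  !0 = !{!"branch_weights", i32 100, i32 1}

  ; After (roughly): conditional branch plus phi, weights carried over.
  define i32 @sketch(i32 %a, i32 %b, i1 %cmp) {
  entry:
    %sel.frozen = freeze i1 %cmp
    br i1 %sel.frozen, label %select.end, label %select.false, !prof !0
  select.false:
    br label %select.end
  select.end:
    %sel = phi i32 [ %a, %entry ], [ %b, %select.false ]
    ret i32 %sel
  }

The other base heuristic converts a select whose cold value operand has an expensive one-use dependence slice: the slice's latency cost is scaled by the coldness of the operand (roughly SliceCost * HotWeight / TotalWeight, rounded to the nearest integer) and compared against ColdOperandMaxCostMultiplier * TCC_Expensive.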
Depends on D122259
Reviewed By: davidxl
Differential Revision: https://reviews.llvm.org/D120231
Added:
Modified:
llvm/lib/CodeGen/SelectOptimize.cpp
llvm/test/CodeGen/X86/select-optimize.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 337d825b1dcd..5cf937335df9 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -15,37 +15,73 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <stack>
+#include <string>
using namespace llvm;
#define DEBUG_TYPE "select-optimize"
+STATISTIC(NumSelectOptAnalyzed,
+ "Number of select groups considered for conversion to branch");
+STATISTIC(NumSelectConvertedExpColdOperand,
+ "Number of select groups converted due to expensive cold operand");
+STATISTIC(NumSelectConvertedHighPred,
+ "Number of select groups converted due to high-predictability");
+STATISTIC(NumSelectUnPred,
+ "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectColdBB,
+ "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectsConverted, "Number of selects converted");
+static cl::opt<unsigned> ColdOperandThreshold(
+ "cold-operand-threshold",
+ cl::desc("Maximum frequency of path for an operand to be considered cold."),
+ cl::init(20), cl::Hidden);
+
+static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
+ "cold-operand-max-cost-multiplier",
+ cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
+ "slice of a cold operand to be considered inexpensive."),
+ cl::init(1), cl::Hidden);
+
namespace {
class SelectOptimize : public FunctionPass {
const TargetMachine *TM = nullptr;
const TargetSubtargetInfo *TSI;
const TargetLowering *TLI = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
const LoopInfo *LI;
+ DominatorTree *DT;
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
+ ProfileSummaryInfo *PSI;
+ OptimizationRemarkEmitter *ORE;
public:
static char ID;
+
SelectOptimize() : FunctionPass(ID) {
initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
}
@@ -53,8 +89,12 @@ class SelectOptimize : public FunctionPass {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
private:
@@ -63,9 +103,47 @@ class SelectOptimize : public FunctionPass {
using SelectGroup = SmallVector<SelectInst *, 2>;
using SelectGroups = SmallVector<SelectGroup, 2>;
+ // Converts select instructions of a function to conditional jumps when deemed
+ // profitable. Returns true if at least one select was converted.
bool optimizeSelects(Function &F);
+
+ // Heuristics for determining which select instructions can be profitably
+ // converted to branches. Separate heuristics for selects in inner-most loops
+ // and the rest of code regions (base heuristics for non-inner-most loop
+ // regions).
+ void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
+ void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
+
+ // Converts to branches the select groups that were deemed
+ // profitable-to-convert.
void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
+
+ // Splits selects of a given basic block into select groups.
void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
+
+ // Determines which select groups are profitable to convert to branches
+ // (base heuristics).
+ void findProfitableSIGroupsBase(SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups);
+ // Determines if a select group should be converted to a branch (base
+ // heuristics).
+ bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
+
+ // Returns true if the dependence slice of the cold value operand (if any)
+ // of any select in the given group contains expensive instructions.
+ bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
+
+ // For a given source instruction, collect its backwards dependence slice
+ // consisting of instructions exclusively computed for producing the operands
+ // of the source instruction.
+ void getExclBackwardsSlice(Instruction *I,
+ SmallVector<Instruction *, 2> &Slice);
+
+ // Returns true if the condition of the select is highly predictable.
+ bool isSelectHighlyPredictable(const SelectInst *SI);
+
+ // Returns true if the target architecture supports lowering a given select.
bool isSelectKindSupported(SelectInst *SI);
};
} // namespace
@@ -75,7 +153,11 @@ char SelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
false)
@@ -85,27 +167,37 @@ bool SelectOptimize::runOnFunction(Function &F) {
TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
TSI = TM->getSubtargetImpl(F);
TLI = TSI->getTargetLowering();
+
+ // If none of the select types is supported then skip this pass.
+ // This is an optimization pass. Legality issues will be handled by
+ // instruction selection.
+ if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
+ !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
+ !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
+ return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
BPI.reset(new BranchProbabilityInfo(F, *LI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ // When optimizing for size, selects are preferable to branches.
+ if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
+ return false;
return optimizeSelects(F);
}
bool SelectOptimize::optimizeSelects(Function &F) {
- // Collect all the select groups.
- SelectGroups SIGroups;
- for (BasicBlock &BB : F) {
- collectSelectGroups(BB, SIGroups);
- }
-
// Determine which select groups are profitable to convert to branches.
SelectGroups ProfSIGroups;
- // For now assume that all select groups can be profitably converted to
- // branches.
- for (SelectGroup &ASI : SIGroups) {
- ProfSIGroups.push_back(ASI);
- }
+ // Base heuristics apply only to code outside inner-most loops.
+ optimizeSelectsBase(F, ProfSIGroups);
+ // Separate heuristics for inner-most loops.
+ optimizeSelectsInnerLoops(F, ProfSIGroups);
// Convert to branches the select groups that were deemed
// profitable-to-convert.
@@ -115,6 +207,25 @@ bool SelectOptimize::optimizeSelects(Function &F) {
return !ProfSIGroups.empty();
}
+void SelectOptimize::optimizeSelectsBase(Function &F,
+ SelectGroups &ProfSIGroups) {
+ // Collect all the select groups.
+ SelectGroups SIGroups;
+ for (BasicBlock &BB : F) {
+ // Base heuristics apply only to code outside inner-most loops.
+ Loop *L = LI->getLoopFor(&BB);
+ if (L && L->isInnermost())
+ continue;
+ collectSelectGroups(BB, SIGroups);
+ }
+
+ // Determine which select groups are profitable to convert to branches.
+ findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
+}
+
+void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
+ SelectGroups &ProfSIGroups) {}
+
/// If \p isTrue is true, return the true value of \p SI, otherwise return
/// false value of \p SI. If the true/false value of \p SI is defined by any
/// select instructions in \p Selects, look through the defining select
@@ -256,6 +367,168 @@ void SelectOptimize::collectSelectGroups(BasicBlock &BB,
}
}
+void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups) {
+ for (SelectGroup &ASI : SIGroups) {
+ ++NumSelectOptAnalyzed;
+ if (isConvertToBranchProfitableBase(ASI))
+ ProfSIGroups.push_back(ASI);
+ }
+}
+
+bool SelectOptimize::isConvertToBranchProfitableBase(
+ const SmallVector<SelectInst *, 2> &ASI) {
+ SelectInst *SI = ASI.front();
+ OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
+
+ // Skip cold basic blocks. Better to optimize for size for cold blocks.
+ if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
+ ++NumSelectColdBB;
+ ORmiss << "Not converted to branch because of cold basic block. ";
+ ORE->emit(ORmiss);
+ return false;
+ }
+
+ // If unpredictable, branch form is less profitable.
+ if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
+ ++NumSelectUnPred;
+ ORmiss << "Not converted to branch because of unpredictable branch. ";
+ ORE->emit(ORmiss);
+ return false;
+ }
+
+ // If highly predictable, branch form is more profitable, unless a
+ // predictable select is inexpensive in the target architecture.
+ if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
+ ++NumSelectConvertedHighPred;
+ OR << "Converted to branch because of highly predictable branch. ";
+ ORE->emit(OR);
+ return true;
+ }
+
+ // Look for expensive instructions in the dependence slice of the cold value
+ // operand (if any) of any select in the group.
+ if (hasExpensiveColdOperand(ASI)) {
+ ++NumSelectConvertedExpColdOperand;
+ OR << "Converted to branch because of expensive cold operand.";
+ ORE->emit(OR);
+ return true;
+ }
+
+ ORmiss << "Not profitable to convert to branch (base heuristic).";
+ ORE->emit(ORmiss);
+ return false;
+}
+
+static InstructionCost divideNearest(InstructionCost Numerator,
+ uint64_t Denominator) {
+ return (Numerator + (Denominator / 2)) / Denominator;
+}
+
+bool SelectOptimize::hasExpensiveColdOperand(
+ const SmallVector<SelectInst *, 2> &ASI) {
+ bool ColdOperand = false;
+ uint64_t TrueWeight, FalseWeight, TotalWeight;
+ if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
+ TotalWeight = TrueWeight + FalseWeight;
+ // Is there a path with frequency < ColdOperandThreshold% (default: 20%)?
+ ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
+ } else if (PSI->hasProfileSummary()) {
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+ ORmiss << "Profile data available but missing branch-weights metadata for "
+ "select instruction. ";
+ ORE->emit(ORmiss);
+ }
+ if (!ColdOperand)
+ return false;
+ // Check if the cold path's dependence slice is expensive for any of the
+ // selects of the group.
+ for (SelectInst *SI : ASI) {
+ Instruction *ColdI = nullptr;
+ uint64_t HotWeight;
+ if (TrueWeight < FalseWeight) {
+ ColdI = dyn_cast<Instruction>(SI->getTrueValue());
+ HotWeight = FalseWeight;
+ } else {
+ ColdI = dyn_cast<Instruction>(SI->getFalseValue());
+ HotWeight = TrueWeight;
+ }
+ if (ColdI) {
+ SmallVector<Instruction *, 2> ColdSlice;
+ getExclBackwardsSlice(ColdI, ColdSlice);
+ InstructionCost SliceCost = 0;
+ for (auto *ColdII : ColdSlice) {
+ SliceCost +=
+ TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency);
+ }
+ // With a cmov, the cold value operand is computed unconditionally on every
+ // execution, so the colder that operand is, the more wasted work its slice
+ // represents and the more its cost should count.
+ // Get nearest integer cost adjusted for coldness.
+ InstructionCost AdjSliceCost =
+ divideNearest(SliceCost * HotWeight, TotalWeight);
+ if (AdjSliceCost >=
+ ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
+ return true;
+ }
+ }
+ return false;
+}
+
+// For a given source instruction, collect its backwards dependence slice
+// consisting of instructions exclusively computed for the purpose of producing
+// the operands of the source instruction. As an approximation
+// (sufficiently accurate in practice), we populate this set with the
+// instructions of the backwards dependence slice that have only one use and
+// form a one-use chain that leads to the source instruction.
+void SelectOptimize::getExclBackwardsSlice(
+ Instruction *I, SmallVector<Instruction *, 2> &Slice) {
+ SmallPtrSet<Instruction *, 2> Visited;
+ std::queue<Instruction *> Worklist;
+ Worklist.push(I);
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.front();
+ Worklist.pop();
+
+ // Avoid cycles.
+ if (Visited.count(II))
+ continue;
+ Visited.insert(II);
+
+ if (!II->hasOneUse())
+ continue;
+
+ // Avoid considering instructions with a lower frequency than the source
+ // instruction (i.e., avoid colder code regions of the dependence slice).
+ if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
+ continue;
+
+ // Eligible one-use instruction added to the dependence slice.
+ Slice.push_back(II);
+
+ // Explore all the operands of the current instruction to expand the slice.
+ for (unsigned k = 0; k < II->getNumOperands(); ++k)
+ if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
+ Worklist.push(OpI);
+ }
+}
+
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
+ uint64_t TrueWeight, FalseWeight;
+ if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t Max = std::max(TrueWeight, FalseWeight);
+ uint64_t Sum = TrueWeight + FalseWeight;
+ if (Sum != 0) {
+ auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+ if (Probability > TTI->getPredictableBranchThreshold())
+ return true;
+ }
+ }
+ return false;
+}
+
bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
if (VectorCond)
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index 300fb4de312d..99a194f033bb 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -1,47 +1,133 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64-unknown-unknown -select-optimize -S < %s | FileCheck %s
-; Single select converted to branch
-define i32 @single_select(i32 %a, i32 %b, i1 %cmp) {
-; CHECK-LABEL: @single_select(
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 1:
+;; highly-biased selects assumed to be highly predictable, converted to branches
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select1(
; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16:![0-9]+]]
; CHECK: select.false:
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
; CHECK-NEXT: ret i32 [[SEL]]
;
- %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ ret i32 %sel
+}
+
+; If a select is obviously predictable (reversed profile weights),
+; turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select2(
+; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF17:![0-9]+]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+ ret i32 %sel
+}
+
+; Not obviously predictable select.
+define i32 @weighted_select3(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select3(
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF18:![0-9]+]]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
+ ret i32 %sel
+}
+
+; Unpredictable select should not form a branch.
+define i32 @unpred_select(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @unpred_select(
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !unpredictable !19
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %sel = select i1 %cmp, i32 %a, i32 %b, !unpredictable !20
+ ret i32 %sel
+}
+
+; Predictable select in function with optsize attribute should not form branch.
+define i32 @weighted_select_optsize(i32 %a, i32 %b, i1 %cmp) optsize {
+; CHECK-LABEL: @weighted_select_optsize(
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
ret i32 %sel
}
-; Select group converted to branch
-define i32 @select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
-; CHECK-LABEL: @select_group(
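+; Predictable select in function qualifying for profile-guided size optimization
+; (cold entry count) should not form branch.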
+define i32 @weighted_select_pgso(i32 %a, i32 %b, i1 %cmp) !prof !14 {
+; CHECK-LABEL: @weighted_select_pgso(
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ ret i32 %sel
+}
+
+; If two selects in a row are predictable, turn them into branches.
+define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
+; CHECK-LABEL: @weighted_selects(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[SEL]], 0
+; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP1]]
+; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]], !prof [[PROF16]]
+; CHECK: select.false2:
+; CHECK-NEXT: br label [[SELECT_END1]]
+; CHECK: select.end1:
+; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[B]], [[SELECT_END]] ], [ [[A]], [[SELECT_FALSE2]] ]
+; CHECK-NEXT: ret i32 [[SEL1]]
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ %cmp1 = icmp ne i32 %sel, 0
+ %sel1 = select i1 %cmp1, i32 %b, i32 %a, !prof !15
+ ret i32 %sel1
+}
+
+; If a select group is predictable, turn it into a branch.
+define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
+; CHECK-LABEL: @weighted_select_group(
; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
; CHECK: select.false:
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]]
; CHECK-NEXT: ret i32 [[ADD]]
;
- %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0
- call void @llvm.dbg.value(metadata i32 %sel1, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !3)
- %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !0
+ %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23)
+ %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
%add = add i32 %sel1, %sel2
ret i32 %add
}
-; Select group with intra-group dependence converted to branch
+; Predictable select group with intra-group dependence converted to branch
define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
; CHECK-LABEL: @select_group_intra_group(
; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]]
+; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
; CHECK: select.false:
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
@@ -50,22 +136,110 @@ define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[SEL1]], [[SEL2]]
; CHECK-NEXT: ret i32 [[SUB]]
;
- %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0
- %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !0
+ %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !15
%sub = sub i32 %sel1, %sel2
ret i32 %sub
}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 2:
+;; look for expensive instructions in the one-use slice of the cold path
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Select with cold one-use load value operand should form branch and
+; sink load
+define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand1(
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %load = load i32, i32* %a, align 8
+ %sel = select i1 %cmp, i32 %load, i32 %y, !prof !17
+ ret i32 %sel
+}
+
+; Expensive hot value operand and cheap cold value operand.
+define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand2(
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]], !prof [[PROF18]]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %load = load i32, i32* %a, align 8
+ %sel = select i1 %cmp, i32 %x, i32 %load, !prof !17
+ ret i32 %sel
+}
+
+; Cold value operand with load in its one-use dependence slice should result
+; in a branch with the dependence slice sunk.
+define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand3(
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
+; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %load = load i32, i32* %a, align 8
+ %x = add i32 %load, %b
+ %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+ ret i32 %sel
+}
+
+; Multiple uses of the load value operand.
+define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand4(
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[B:%.*]], align 4
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[LOAD]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %load = load i32, i32* %b, align 4
+ %sel = select i1 %cmp, i32 %x, i32 %load
+ %add = add i32 %sel, %load
+ ret i32 %add
+}
+
; Function Attrs: nounwind readnone speculatable willreturn
declare void @llvm.dbg.value(metadata, metadata, metadata)
-!llvm.module.flags = !{!6, !7}
-
-!0 = !{!"branch_weights", i32 1, i32 100}
-!1 = !DIFile(filename: "test.c", directory: "/test")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !5, splitDebugInlining: false, nameTableKind: None)
-!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !2)
-!4 = !DILocalVariable(name: "x", scope: !3)
-!5 = !{}
-!6 = !{i32 2, !"Dwarf Version", i32 4}
-!7 = !{i32 1, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!0, !26, !27}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 100}
+!16 = !{!"branch_weights", i32 100, i32 1}
+!17 = !{!"branch_weights", i32 1, i32 99}
+!18 = !{!"branch_weights", i32 50, i32 50}
+!19 = !{!"function_entry_count", i64 100}
+!20 = !{}
+!21 = !DIFile(filename: "test.c", directory: "/test")
+!22 = distinct !DICompileUnit(language: DW_LANG_C99, file: !21, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !25, splitDebugInlining: false, nameTableKind: None)
+!23 = distinct !DISubprogram(name: "test", scope: !21, file: !21, line: 1, unit: !22)
+!24 = !DILocalVariable(name: "x", scope: !23)
+!25 = !{}
+!26 = !{i32 2, !"Dwarf Version", i32 4}
+!27 = !{i32 1, !"Debug Info Version", i32 3}