[llvm] 8b42bc5 - [SelectOpti][3/5] Base Heuristics

Sotiris Apostolakis via llvm-commits llvm-commits at lists.llvm.org
Mon May 23 19:02:27 PDT 2022


Author: Sotiris Apostolakis
Date: 2022-05-23T22:01:12-04:00
New Revision: 8b42bc5662ca13c97746b7301bd503b2662dc444

URL: https://github.com/llvm/llvm-project/commit/8b42bc5662ca13c97746b7301bd503b2662dc444
DIFF: https://github.com/llvm/llvm-project/commit/8b42bc5662ca13c97746b7301bd503b2662dc444.diff

LOG: [SelectOpti][3/5] Base Heuristics

This patch adds the base heuristics for determining whether branches are more profitable than conditional moves.
Base heuristics apply to all code apart from inner-most loops.
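
For illustration, a minimal sketch of the rewrite the first base heuristic
gates, modeled on the X86 tests updated below (the function and label names
follow the test's CHECK lines and are otherwise illustrative): a select whose
profile weights are heavily biased toward one arm is treated as highly
predictable and, on targets where predictable selects are expensive, is
expanded into an explicit conditional branch.

  ; Before: heavily biased select (branch_weights 1:100).
  define i32 @biased(i32 %a, i32 %b, i1 %cmp) {
  entry:
    %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
    ret i32 %sel
  }
  !0 = !{!"branch_weights", i32 1, i32 100}

  ; After -select-optimize (roughly; see the autogenerated CHECK lines below):
  define i32 @biased(i32 %a, i32 %b, i1 %cmp) {
  entry:
    %sel.frozen = freeze i1 %cmp
    br i1 %sel.frozen, label %select.end, label %select.false, !prof !0
  select.false:
    br label %select.end
  select.end:
    %sel = phi i32 [ %a, %entry ], [ %b, %select.false ]
    ret i32 %sel
  }

The second base heuristic converts a select whose cold value operand (one whose
path frequency is below cold-operand-threshold, 20% by default) has an
expensive one-use dependence slice: the slice's latency cost is scaled by
hot-weight/total-weight and compared against a TCC_Expensive-based threshold.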

Depends on D122259

Reviewed By: davidxl

Differential Revision: https://reviews.llvm.org/D120231

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectOptimize.cpp
    llvm/test/CodeGen/X86/select-optimize.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 337d825b1dcd..5cf937335df9 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -15,37 +15,73 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <stack>
+#include <string>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "select-optimize"
 
+STATISTIC(NumSelectOptAnalyzed,
+          "Number of select groups considered for conversion to branch");
+STATISTIC(NumSelectConvertedExpColdOperand,
+          "Number of select groups converted due to expensive cold operand");
+STATISTIC(NumSelectConvertedHighPred,
+          "Number of select groups converted due to high-predictability");
+STATISTIC(NumSelectUnPred,
+          "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectColdBB,
+          "Number of select groups not converted due to cold basic block");
 STATISTIC(NumSelectsConverted, "Number of selects converted");
 
+static cl::opt<unsigned> ColdOperandThreshold(
+    "cold-operand-threshold",
+    cl::desc("Maximum frequency of path for an operand to be considered cold."),
+    cl::init(20), cl::Hidden);
+
+static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
+    "cold-operand-max-cost-multiplier",
+    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
+             "slice of a cold operand to be considered inexpensive."),
+    cl::init(1), cl::Hidden);
+
 namespace {
 
 class SelectOptimize : public FunctionPass {
   const TargetMachine *TM = nullptr;
   const TargetSubtargetInfo *TSI;
   const TargetLowering *TLI = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   const LoopInfo *LI;
+  DominatorTree *DT;
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
+  ProfileSummaryInfo *PSI;
+  OptimizationRemarkEmitter *ORE;
 
 public:
   static char ID;
+
   SelectOptimize() : FunctionPass(ID) {
     initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
   }
@@ -53,8 +89,12 @@ class SelectOptimize : public FunctionPass {
   bool runOnFunction(Function &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
   }
 
 private:
@@ -63,9 +103,47 @@ class SelectOptimize : public FunctionPass {
   using SelectGroup = SmallVector<SelectInst *, 2>;
   using SelectGroups = SmallVector<SelectGroup, 2>;
 
+  // Converts select instructions of a function to conditional jumps when deemed
+  // profitable. Returns true if at least one select was converted.
   bool optimizeSelects(Function &F);
+
+  // Heuristics for determining which select instructions can be profitably
+  // converted to branches. Separate heuristics for selects in inner-most loops
+  // and the rest of code regions (base heuristics for non-inner-most loop
+  // regions).
+  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
+  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
+
+  // Converts to branches the select groups that were deemed
+  // profitable-to-convert.
   void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
+
+  // Splits selects of a given basic block into select groups.
   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
+
+  // Determines which select groups are profitable to convert to branches
+  // (base heuristics).
+  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
+                                  SelectGroups &ProfSIGroups);
+  // Determines if a select group should be converted to a branch (base
+  // heuristics).
+  bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
+
+  // Returns true if there are expensive instructions in the dependence slice
+  // of the cold value operand (if any) of any of the selects in the given
+  // group.
+  bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
+
+  // For a given source instruction, collect its backwards dependence slice
+  // consisting of instructions exclusively computed for producing the operands
+  // of the source instruction.
+  void getExclBackwardsSlice(Instruction *I,
+                             SmallVector<Instruction *, 2> &Slice);
+
+  // Returns true if the condition of the select is highly predictable.
+  bool isSelectHighlyPredictable(const SelectInst *SI);
+
+  // Returns true if the target architecture supports lowering a given select.
   bool isSelectKindSupported(SelectInst *SI);
 };
 } // namespace
@@ -75,7 +153,11 @@ char SelectOptimize::ID = 0;
 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                     false)
 
@@ -85,27 +167,37 @@ bool SelectOptimize::runOnFunction(Function &F) {
   TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
   TSI = TM->getSubtargetImpl(F);
   TLI = TSI->getTargetLowering();
+
+  // If none of the select types is supported then skip this pass.
+  // This is an optimization pass. Legality issues will be handled by
+  // instruction selection.
+  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
+      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
+      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
+    return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   BPI.reset(new BranchProbabilityInfo(F, *LI));
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+  // When optimizing for size, selects are preferable over branches.
+  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
+    return false;
 
   return optimizeSelects(F);
 }
 
 bool SelectOptimize::optimizeSelects(Function &F) {
-  // Collect all the select groups.
-  SelectGroups SIGroups;
-  for (BasicBlock &BB : F) {
-    collectSelectGroups(BB, SIGroups);
-  }
-
   // Determine for which select groups it is profitable converting to branches.
   SelectGroups ProfSIGroups;
-  // For now assume that all select groups can be profitably converted to
-  // branches.
-  for (SelectGroup &ASI : SIGroups) {
-    ProfSIGroups.push_back(ASI);
-  }
+  // Base heuristics apply only to non-loops and outer loops.
+  optimizeSelectsBase(F, ProfSIGroups);
+  // Separate heuristics for inner-most loops.
+  optimizeSelectsInnerLoops(F, ProfSIGroups);
 
   // Convert to branches the select groups that were deemed
   // profitable-to-convert.
@@ -115,6 +207,25 @@ bool SelectOptimize::optimizeSelects(Function &F) {
   return !ProfSIGroups.empty();
 }
 
+void SelectOptimize::optimizeSelectsBase(Function &F,
+                                         SelectGroups &ProfSIGroups) {
+  // Collect all the select groups.
+  SelectGroups SIGroups;
+  for (BasicBlock &BB : F) {
+    // Base heuristics apply only to non-loops and outer loops.
+    Loop *L = LI->getLoopFor(&BB);
+    if (L && L->isInnermost())
+      continue;
+    collectSelectGroups(BB, SIGroups);
+  }
+
+  // Determine which select groups are profitable to convert to branches.
+  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
+}
+
+void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
+                                               SelectGroups &ProfSIGroups) {}
+
 /// If \p isTrue is true, return the true value of \p SI, otherwise return
 /// false value of \p SI. If the true/false value of \p SI is defined by any
 /// select instructions in \p Selects, look through the defining select
@@ -256,6 +367,168 @@ void SelectOptimize::collectSelectGroups(BasicBlock &BB,
   }
 }
 
+void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
+                                                SelectGroups &ProfSIGroups) {
+  for (SelectGroup &ASI : SIGroups) {
+    ++NumSelectOptAnalyzed;
+    if (isConvertToBranchProfitableBase(ASI))
+      ProfSIGroups.push_back(ASI);
+  }
+}
+
+bool SelectOptimize::isConvertToBranchProfitableBase(
+    const SmallVector<SelectInst *, 2> &ASI) {
+  SelectInst *SI = ASI.front();
+  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
+  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
+
+  // Skip cold basic blocks. Better to optimize for size for cold blocks.
+  if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
+    ++NumSelectColdBB;
+    ORmiss << "Not converted to branch because of cold basic block. ";
+    ORE->emit(ORmiss);
+    return false;
+  }
+
+  // If unpredictable, branch form is less profitable.
+  if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
+    ++NumSelectUnPred;
+    ORmiss << "Not converted to branch because of unpredictable branch. ";
+    ORE->emit(ORmiss);
+    return false;
+  }
+
+  // If highly predictable, branch form is more profitable, unless a
+  // predictable select is inexpensive in the target architecture.
+  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
+    ++NumSelectConvertedHighPred;
+    OR << "Converted to branch because of highly predictable branch. ";
+    ORE->emit(OR);
+    return true;
+  }
+
+  // Look for expensive instructions in the cold operand's (if any) dependence
+  // slice of any of the selects in the group.
+  if (hasExpensiveColdOperand(ASI)) {
+    ++NumSelectConvertedExpColdOperand;
+    OR << "Converted to branch because of expensive cold operand.";
+    ORE->emit(OR);
+    return true;
+  }
+
+  ORmiss << "Not profitable to convert to branch (base heuristic).";
+  ORE->emit(ORmiss);
+  return false;
+}
+
+static InstructionCost divideNearest(InstructionCost Numerator,
+                                     uint64_t Denominator) {
+  return (Numerator + (Denominator / 2)) / Denominator;
+}
+
+bool SelectOptimize::hasExpensiveColdOperand(
+    const SmallVector<SelectInst *, 2> &ASI) {
+  bool ColdOperand = false;
+  uint64_t TrueWeight, FalseWeight, TotalWeight;
+  if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
+    TotalWeight = TrueWeight + FalseWeight;
+    // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
+    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
+  } else if (PSI->hasProfileSummary()) {
+    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+    ORmiss << "Profile data available but missing branch-weights metadata for "
+              "select instruction. ";
+    ORE->emit(ORmiss);
+  }
+  if (!ColdOperand)
+    return false;
+  // Check if the cold path's dependence slice is expensive for any of the
+  // selects of the group.
+  for (SelectInst *SI : ASI) {
+    Instruction *ColdI = nullptr;
+    uint64_t HotWeight;
+    if (TrueWeight < FalseWeight) {
+      ColdI = dyn_cast<Instruction>(SI->getTrueValue());
+      HotWeight = FalseWeight;
+    } else {
+      ColdI = dyn_cast<Instruction>(SI->getFalseValue());
+      HotWeight = TrueWeight;
+    }
+    if (ColdI) {
+      SmallVector<Instruction *, 2> ColdSlice;
+      getExclBackwardsSlice(ColdI, ColdSlice);
+      InstructionCost SliceCost = 0;
+      for (auto *ColdII : ColdSlice) {
+        SliceCost +=
+            TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency);
+      }
+      // The colder the cold value operand is, the more wasteful it becomes to
+      // compute it unconditionally for the cmov on every execution. Thus, the
+      // colder the cold operand, the more its slice cost counts.
+      // Get the nearest integer cost, adjusted for coldness.
+      InstructionCost AdjSliceCost =
+          divideNearest(SliceCost * HotWeight, TotalWeight);
+      if (AdjSliceCost >=
+          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
+        return true;
+    }
+  }
+  return false;
+}
+
+// For a given source instruction, collect its backwards dependence slice
+// consisting of instructions exclusively computed for the purpose of producing
+// the operands of the source instruction. As an approximation
+// (sufficiently accurate in practice), we populate this set with the
+// instructions of the backwards dependence slice that each have only one use
+// and form a one-use chain that leads to the source instruction.
+void SelectOptimize::getExclBackwardsSlice(
+    Instruction *I, SmallVector<Instruction *, 2> &Slice) {
+  SmallPtrSet<Instruction *, 2> Visited;
+  std::queue<Instruction *> Worklist;
+  Worklist.push(I);
+  while (!Worklist.empty()) {
+    Instruction *II = Worklist.front();
+    Worklist.pop();
+
+    // Avoid cycles.
+    if (Visited.count(II))
+      continue;
+    Visited.insert(II);
+
+    if (!II->hasOneUse())
+      continue;
+
+    // Avoid considering instructions with lower frequency than the source
+    // instruction (i.e., avoid colder code regions of the dependence slice).
+    if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
+      continue;
+
+    // Eligible one-use instruction added to the dependence slice.
+    Slice.push_back(II);
+
+    // Explore all the operands of the current instruction to expand the slice.
+    for (unsigned k = 0; k < II->getNumOperands(); ++k)
+      if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
+        Worklist.push(OpI);
+  }
+}
+
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t Max = std::max(TrueWeight, FalseWeight);
+    uint64_t Sum = TrueWeight + FalseWeight;
+    if (Sum != 0) {
+      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+      if (Probability > TTI->getPredictableBranchThreshold())
+        return true;
+    }
+  }
+  return false;
+}
+
 bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
   if (VectorCond)

diff  --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index 300fb4de312d..99a194f033bb 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -1,47 +1,133 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=x86_64-unknown-unknown -select-optimize -S < %s | FileCheck %s
 
-; Single select converted to branch
-define i32 @single_select(i32 %a, i32 %b, i1 %cmp) {
-; CHECK-LABEL: @single_select(
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 1:
+;; highly-biased selects assumed to be highly predictable, converted to branches
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select1(
 ; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16:![0-9]+]]
 ; CHECK:       select.false:
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If a select is obviously predictable (reversed profile weights),
+; turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select2(
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF17:![0-9]+]]
+; CHECK:       select.false:
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
+
+; Not obviously predictable select.
+define i32 @weighted_select3(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @weighted_select3(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF18:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
+  ret i32 %sel
+}
+
+; Unpredictable select should not form a branch.
+define i32 @unpred_select(i32 %a, i32 %b, i1 %cmp) {
+; CHECK-LABEL: @unpred_select(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !unpredictable !19
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %sel = select i1 %cmp, i32 %a, i32 %b, !unpredictable !20
+  ret i32 %sel
+}
+
+; Predictable select in function with optsize attribute should not form branch.
+define i32 @weighted_select_optsize(i32 %a, i32 %b, i1 %cmp) optsize {
+; CHECK-LABEL: @weighted_select_optsize(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
   ret i32 %sel
 }
 
-; Select group converted to branch
-define i32 @select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
-; CHECK-LABEL: @select_group(
+define i32 @weighted_select_pgso(i32 %a, i32 %b, i1 %cmp) !prof !14 {
+; CHECK-LABEL: @weighted_select_pgso(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  ret i32 %sel
+}
+
+; If two selects in a row are predictable, turn them into branches.
+define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
+; CHECK-LABEL: @weighted_selects(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
+; CHECK:       select.false:
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[A]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[SEL]], 0
+; CHECK-NEXT:    [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP1]]
+; CHECK-NEXT:    br i1 [[SEL1_FROZEN]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]], !prof [[PROF16]]
+; CHECK:       select.false2:
+; CHECK-NEXT:    br label [[SELECT_END1]]
+; CHECK:       select.end1:
+; CHECK-NEXT:    [[SEL1:%.*]] = phi i32 [ [[B]], [[SELECT_END]] ], [ [[A]], [[SELECT_FALSE2]] ]
+; CHECK-NEXT:    ret i32 [[SEL1]]
+;
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %cmp1 = icmp ne i32 %sel, 0
+  %sel1 = select i1 %cmp1, i32 %b, i32 %a, !prof !15
+  ret i32 %sel1
+}
+
+; If a select group is predictable, turn it into a branch.
+define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
+; CHECK-LABEL: @weighted_select_group(
 ; CHECK-NEXT:    [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT:    br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
 ; CHECK:       select.false:
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
 ; CHECK-NEXT:    [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]]
 ; CHECK-NEXT:    ret i32 [[ADD]]
 ;
-  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0
-  call void @llvm.dbg.value(metadata i32 %sel1, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !3)
-  %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !0
+  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23)
+  %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
   %add = add i32 %sel1, %sel2
   ret i32 %add
 }
 
-; Select group with intra-group dependence converted to branch
+; Predictable select group with intra-group dependence converted to branch
 define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
 ; CHECK-LABEL: @select_group_intra_group(
 ; CHECK-NEXT:    [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT:    br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
 ; CHECK:       select.false:
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
@@ -50,22 +136,110 @@ define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[SEL1]], [[SEL2]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
-  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0
-  %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !0
+  %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+  %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !15
   %sub = sub i32 %sel1, %sel2
   ret i32 %sub
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Test base heuristic 2:
+;; look for expensive instructions in the one-use slice of the cold path
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Select with cold one-use load value operand should form branch and
+; sink load
+define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand1(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
+; CHECK:       select.false:
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %load, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Expensive hot value operand and cheap cold value operand.
+define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand2(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]], !prof [[PROF18]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %load = load i32, i32* %a, align 8
+  %sel = select i1 %cmp, i32 %x, i32 %load, !prof !17
+  ret i32 %sel
+}
+
+; Cold value operand with a load in its one-use dependence slice should result
+; in a branch with the dependence slice sunk.
+define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand3(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
+; CHECK:       select.false:
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %load = load i32, i32* %a, align 8
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Multiple uses of the load value operand.
+define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand4(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[B:%.*]], align 4
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SEL]], [[LOAD]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %load = load i32, i32* %b, align 4
+  %sel = select i1 %cmp, i32 %x, i32 %load
+  %add = add i32 %sel, %load
+  ret i32 %add
+}
+
 ; Function Attrs: nounwind readnone speculatable willreturn
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
-!llvm.module.flags = !{!6, !7}
-
-!0 = !{!"branch_weights", i32 1, i32 100}
-!1 = !DIFile(filename: "test.c", directory: "/test")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !5, splitDebugInlining: false, nameTableKind: None)
-!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !2)
-!4 = !DILocalVariable(name: "x", scope: !3)
-!5 = !{}
-!6 = !{i32 2, !"Dwarf Version", i32 4}
-!7 = !{i32 1, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!0, !26, !27}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 100}
+!16 = !{!"branch_weights", i32 100, i32 1}
+!17 = !{!"branch_weights", i32 1, i32 99}
+!18 = !{!"branch_weights", i32 50, i32 50}
+!19 = !{!"function_entry_count", i64 100}
+!20 = !{}
+!21 = !DIFile(filename: "test.c", directory: "/test")
+!22 = distinct !DICompileUnit(language: DW_LANG_C99, file: !21, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !25, splitDebugInlining: false, nameTableKind: None)
+!23 = distinct !DISubprogram(name: "test", scope: !21, file: !21, line: 1, unit: !22)
+!24 = !DILocalVariable(name: "x", scope: !23)
+!25 = !{}
+!26 = !{i32 2, !"Dwarf Version", i32 4}
+!27 = !{i32 1, !"Debug Info Version", i32 3}


        

