[llvm] 67be40d - Recommit "[SelectOpti][5/5] Optimize select-to-branch transformation"
Sotiris Apostolakis via llvm-commits
llvm-commits at lists.llvm.org
Tue May 24 11:11:25 PDT 2022
Author: Sotiris Apostolakis
Date: 2022-05-24T14:08:09-04:00
New Revision: 67be40df6eefe375a0c97341940922f0ef7775a4
URL: https://github.com/llvm/llvm-project/commit/67be40df6eefe375a0c97341940922f0ef7775a4
DIFF: https://github.com/llvm/llvm-project/commit/67be40df6eefe375a0c97341940922f0ef7775a4.diff
LOG: Recommit "[SelectOpti][5/5] Optimize select-to-branch transformation"
Use container::size_type directly to avoid a type mismatch that caused build failures on Windows.
Original commit message:
This patch optimizes the transformation of selects to a branch when the heuristics deem it profitable.
It aggressively sinks eligible instructions to the newly created true/false blocks to prevent their
execution on the common path and interleaves dependence slices to maximize ILP.
Depends on D120232
Reviewed By: davidxl
Differential Revision: https://reviews.llvm.org/D120233
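
As a rough illustration of the interleaving described above, here is a minimal standalone C++ sketch (not part of the commit; the toy instruction names and the std::string stand-in for Instruction * are invented for this example) of the round-robin order in which the sunk dependence slices of a select group are emitted:

// Round-robin interleaving of per-select dependence slices, mirroring the
// scheduling used for the instructions sunk into the true/false blocks.
#include <algorithm>
#include <iostream>
#include <stack>
#include <string>
#include <vector>

int main() {
  // One backwards dependence slice (kept as a stack) per select in the group.
  std::vector<std::stack<std::string>> TrueSlices(2);
  TrueSlices[0].push("t0.load");
  TrueSlices[0].push("t0.add"); // top of slice 0
  TrueSlices[1].push("t1.mul"); // top of slice 1

  using StackSizeType = std::stack<std::string>::size_type;
  StackSizeType MaxLen = 0;
  for (const auto &S : TrueSlices)
    MaxLen = std::max(MaxLen, S.size());

  // Take one instruction from each non-empty slice per round instead of
  // emitting one whole slice after another, exposing more ILP.
  std::vector<std::string> Interleaved;
  for (StackSizeType Round = 0; Round < MaxLen; ++Round) {
    for (auto &S : TrueSlices) {
      if (!S.empty()) {
        Interleaved.push_back(S.top());
        S.pop();
      }
    }
  }

  for (const auto &Name : Interleaved)
    std::cout << Name << "\n"; // prints: t0.add, t1.mul, t0.load
}

The pass below does the same walk over std::stack<Instruction *> slices and then moves each interleaved instruction in front of the branch of the new select.true.sink/select.false.sink block.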
Added:
Modified:
llvm/lib/CodeGen/SelectOptimize.cpp
llvm/test/CodeGen/X86/select-optimize.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 2194da77e0b4c..9c786b9a5588e 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -179,8 +179,8 @@ class SelectOptimize : public FunctionPass {
// For a given source instruction, collect its backwards dependence slice
// consisting of instructions exclusively computed for producing the operands
// of the source instruction.
- void getExclBackwardsSlice(Instruction *I,
- SmallVector<Instruction *, 2> &Slice);
+ void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
+ bool ForSinking = false);
// Returns true if the condition of the select is highly predictable.
bool isSelectHighlyPredictable(const SelectInst *SI);
@@ -329,6 +329,10 @@ getTrueOrFalseValue(SelectInst *SI, bool isTrue,
void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
for (SelectGroup &ASI : ProfSIGroups) {
+ // The code transformation here is a modified version of the sinking
+ // transformation in CodeGenPrepare::optimizeSelectInst with a more
+ // aggressive strategy of which instructions to sink.
+ //
// TODO: eliminate the redundancy of logic transforming selects to branches
// by removing CodeGenPrepare::optimizeSelectInst and optimizing here
// selects for all cases (with and without profile information).
@@ -342,13 +346,73 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
// start:
// %cmp = cmp uge i32 %a, %b
// %cmp.frozen = freeze %cmp
- // br i1 %cmp.frozen, label %select.end, label %select.false
+ // br i1 %cmp.frozen, label %select.true, label %select.false
+ // select.true:
+ // br label %select.end
// select.false:
// br label %select.end
// select.end:
- // %sel = phi i32 [ %c, %start ], [ %d, %select.false ]
+ // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
//
// %cmp should be frozen, otherwise it may introduce undefined behavior.
+ // In addition, we may sink instructions that produce %c or %d into the
+ // destination(s) of the new branch.
+ // If the true or false blocks do not contain a sunken instruction, that
+ // block and its branch may be optimized away. In that case, one side of the
+ // first branch will point directly to select.end, and the corresponding PHI
+ // predecessor block will be the start block.
+
+ // Find all the instructions that can be soundly sunk to the true/false
+ // blocks. These are instructions that are computed solely for producing the
+ // operands of the select instructions in the group and can be sunk without
+ // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
+ // with side effects).
+ SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
+ typedef std::stack<Instruction *>::size_type StackSizeType;
+ StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
+ for (SelectInst *SI : ASI) {
+ // For each select, compute the sinkable dependence chains of the true and
+ // false operands.
+ if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
+ std::stack<Instruction *> TrueSlice;
+ getExclBackwardsSlice(TI, TrueSlice, true);
+ maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
+ TrueSlices.push_back(TrueSlice);
+ }
+ if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
+ std::stack<Instruction *> FalseSlice;
+ getExclBackwardsSlice(FI, FalseSlice, true);
+ maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
+ FalseSlices.push_back(FalseSlice);
+ }
+ }
+ // In the case of multiple select instructions in the same group, the order
+ // of non-dependent instructions (instructions of different dependence
+ // slices) in the true/false blocks appears to affect performance.
+ // Interleaving the slices seems to experimentally be the optimal approach.
+ // This interleaving scheduling allows for more ILP (with a natural downside
+ // of slightly increasing register pressure) compared to a simple ordering of
+ // one whole chain after another. One would expect that this ordering would
+ // not matter since the scheduling in the backend of the compiler would
+ // take care of it, but apparently the scheduler fails to deliver optimal
+ // ILP with a naive ordering here.
+ SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
+ for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
+ for (auto &S : TrueSlices) {
+ if (!S.empty()) {
+ TrueSlicesInterleaved.push_back(S.top());
+ S.pop();
+ }
+ }
+ }
+ for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
+ for (auto &S : FalseSlices) {
+ if (!S.empty()) {
+ FalseSlicesInterleaved.push_back(S.top());
+ S.pop();
+ }
+ }
+ }
// We split the block containing the select(s) into two blocks.
SelectInst *SI = ASI.front();
@@ -374,24 +438,55 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
}
// These are the new basic blocks for the conditional branch.
- // For now, no instruction sinking to the true/false blocks.
- // Thus both True and False blocks will be empty.
+ // At least one will become an actual new basic block.
BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
-
- // Use the 'false' side for a new input value to the PHI.
- FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
- EndBlock->getParent(), EndBlock);
- auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
- FalseBranch->setDebugLoc(SI->getDebugLoc());
-
- // For the 'true' side the path originates from the start block from the
- // point view of the new PHI.
- TrueBlock = StartBlock;
+ BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
+ if (!TrueSlicesInterleaved.empty()) {
+ TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
+ EndBlock->getParent(), EndBlock);
+ TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+ TrueBranch->setDebugLoc(LastSI->getDebugLoc());
+ for (Instruction *TrueInst : TrueSlicesInterleaved)
+ TrueInst->moveBefore(TrueBranch);
+ }
+ if (!FalseSlicesInterleaved.empty()) {
+ FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
+ EndBlock->getParent(), EndBlock);
+ FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+ FalseBranch->setDebugLoc(LastSI->getDebugLoc());
+ for (Instruction *FalseInst : FalseSlicesInterleaved)
+ FalseInst->moveBefore(FalseBranch);
+ }
+ // If there was nothing to sink, then arbitrarily choose the 'false' side
+ // for a new input value to the PHI.
+ if (TrueBlock == FalseBlock) {
+ assert(TrueBlock == nullptr &&
+ "Unexpected basic block transform while optimizing select");
+
+ FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+ EndBlock->getParent(), EndBlock);
+ auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+ FalseBranch->setDebugLoc(SI->getDebugLoc());
+ }
// Insert the real conditional branch based on the original condition.
+ // If we did not create a new block for one of the 'true' or 'false' paths
+ // of the condition, it means that side of the branch goes to the end block
+ // directly and the path originates from the start block from the point of
+ // view of the new PHI.
BasicBlock *TT, *FT;
- TT = EndBlock;
- FT = FalseBlock;
+ if (TrueBlock == nullptr) {
+ TT = EndBlock;
+ FT = FalseBlock;
+ TrueBlock = StartBlock;
+ } else if (FalseBlock == nullptr) {
+ TT = TrueBlock;
+ FT = EndBlock;
+ FalseBlock = StartBlock;
+ } else {
+ TT = TrueBlock;
+ FT = FalseBlock;
+ }
IRBuilder<> IB(SI);
auto *CondFr =
IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
@@ -586,12 +681,13 @@ bool SelectOptimize::hasExpensiveColdOperand(
HotWeight = TrueWeight;
}
if (ColdI) {
- SmallVector<Instruction *, 2> ColdSlice;
+ std::stack<Instruction *> ColdSlice;
getExclBackwardsSlice(ColdI, ColdSlice);
InstructionCost SliceCost = 0;
- for (auto *ColdII : ColdSlice) {
- SliceCost +=
- TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency);
+ while (!ColdSlice.empty()) {
+ SliceCost += TTI->getInstructionCost(ColdSlice.top(),
+ TargetTransformInfo::TCK_Latency);
+ ColdSlice.pop();
}
// The colder the cold value operand of the select is the more expensive
// the cmov becomes for computing the cold value operand every time. Thus,
@@ -613,8 +709,9 @@ bool SelectOptimize::hasExpensiveColdOperand(
// (sufficiently-accurate in practice), we populate this set with the
// instructions of the backwards dependence slice that have only one use and
// form a one-use chain that leads to the source instruction.
-void SelectOptimize::getExclBackwardsSlice(
- Instruction *I, SmallVector<Instruction *, 2> &Slice) {
+void SelectOptimize::getExclBackwardsSlice(Instruction *I,
+ std::stack<Instruction *> &Slice,
+ bool ForSinking) {
SmallPtrSet<Instruction *, 2> Visited;
std::queue<Instruction *> Worklist;
Worklist.push(I);
@@ -630,13 +727,20 @@ void SelectOptimize::getExclBackwardsSlice(
if (!II->hasOneUse())
continue;
+ // Cannot soundly sink instructions with side-effects.
+ // Terminator or phi instructions cannot be sunk.
+ // Avoid sinking other select instructions (they should be handled separately).
+ if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
+ isa<SelectInst>(II) || isa<PHINode>(II)))
+ continue;
+
// Avoid considering instructions with less frequency than the source
// instruction (i.e., avoid colder code regions of the dependence slice).
if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
continue;
// Eligible one-use instruction added to the dependence slice.
- Slice.push_back(II);
+ Slice.push(II);
// Explore all the operands of the current instruction to expand the slice.
for (unsigned k = 0; k < II->getNumOperands(); ++k)
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index ee267c80e7a5e..f2dff8cf8c9f1 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -105,20 +105,28 @@ define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
; If select group predictable, turn it into a branch.
define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
; CHECK-LABEL: @weighted_select_group(
+; CHECK-NEXT: [[A1:%.*]] = add i32 [[A:%.*]], 1
; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
-; CHECK: select.false:
+; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF16]]
+; CHECK: select.true.sink:
+; CHECK-NEXT: [[C1:%.*]] = add i32 [[C:%.*]], 1
+; CHECK-NEXT: br label [[SELECT_END:%.*]]
+; CHECK: select.false.sink:
+; CHECK-NEXT: [[B1:%.*]] = add i32 [[B:%.*]], 1
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
-; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
-; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A1]], [[SELECT_TRUE_SINK]] ], [ [[B1]], [[SELECT_FALSE_SINK]] ]
+; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C1]], [[SELECT_TRUE_SINK]] ], [ [[A1]], [[SELECT_FALSE_SINK]] ]
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]]
; CHECK-NEXT: ret i32 [[ADD]]
;
- %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
+ %a1 = add i32 %a, 1
+ %b1 = add i32 %b, 1
+ %c1 = add i32 %c, 1
+ %sel1 = select i1 %cmp, i32 %a1, i32 %b1, !prof !15
call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23)
- %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
+ %sel2 = select i1 %cmp, i32 %c1, i32 %a1, !prof !15
%add = add i32 %sel1, %sel2
ret i32 %add
}
@@ -151,13 +159,13 @@ define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
; sink load
define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
; CHECK-LABEL: @expensive_val_operand1(
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
-; CHECK: select.false:
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
+; CHECK: select.true.sink:
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
-; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
; CHECK-NEXT: ret i32 [[SEL]]
;
%load = load i32, i32* %a, align 8
@@ -181,14 +189,14 @@ define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
; into a branch with sinked dependence slice.
define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
; CHECK-LABEL: @expensive_val_operand3(
+; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
+; CHECK: select.true.sink:
; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
-; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
-; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
-; CHECK: select.false:
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
-; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
; CHECK-NEXT: ret i32 [[SEL]]
;
%load = load i32, i32* %a, align 8
@@ -242,14 +250,14 @@ define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) {
; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]]
; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]]
-; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF27:![0-9]+]]
-; CHECK: select.false:
+; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END]], !prof [[PROF27:![0-9]+]]
+; CHECK: select.true.sink:
+; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
; CHECK-NEXT: br label [[SELECT_END]]
; CHECK: select.end:
-; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[SELECT_TRUE_SINK]] ], [ [[X1]], [[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]