[llvm] ed9df5b - [Passes] Run sinking/hoisting in SimplifyCFG earlier.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 30 04:35:46 PDT 2021
Author: Florian Hahn
Date: 2021-04-30T12:23:57+01:00
New Revision: ed9df5bd2f50b2199204cc6e50910ba52dd5e93e
URL: https://github.com/llvm/llvm-project/commit/ed9df5bd2f50b2199204cc6e50910ba52dd5e93e
DIFF: https://github.com/llvm/llvm-project/commit/ed9df5bd2f50b2199204cc6e50910ba52dd5e93e.diff
LOG: [Passes] Run sinking/hoisting in SimplifyCFG earlier.
Hoisting and sinking instructions out of conditional blocks enables
additional vectorization by:
1. Executing memory accesses unconditionally.
2. Reducing the number of instructions that need predication.
After disabling early hoisting / sinking, we miss out on a few
vectorization opportunities. One of those is causing a ~10% performance
regression in one of the Geekbench benchmarks on AArch64.
This patch tries to recover the regression by running hoisting/sinking
as part of a SimplifyCFG run after LoopRotate and before LoopVectorize.
Note that in the legacy pass-manager, we run LoopRotate just before
vectorization again and there's no SimplifyCFG run in between, so the
sinking/hoisting may impact the later run on LoopRotate. But the impact
should be limited and the benefit of hoisting/sinking at this stage
should outweigh the risk of not rotating.
Compile-time impact looks slightly positive for most cases.
http://llvm-compile-time-tracker.com/compare.php?from=2ea7fb7b1c045a7d60fcccf3df3ebb26aa3699e5&to=e58b4a763c691da651f25996aad619cb3d946faf&stat=instructions
NewPM-O3: geomean -0.19%
NewPM-ReleaseThinLTO: geomean -0.54%
NewPM-ReleaseLTO-g: geomean -0.03%
With a few benchmarks seeing a notable increase, but also some
improvements.
Alternative to D101290.
Reviewed By: lebedev.ri
Differential Revision: https://reviews.llvm.org/D101468
Added:
Modified:
llvm/lib/Passes/PassBuilder.cpp
llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll
llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll
llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
Removed:
################################################################################
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index b8f6a200bf496..31e9e14a25e9e 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -847,7 +847,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
for (auto &C : ScalarOptimizerLateEPCallbacks)
C(FPM, Level);
- FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(SimplifyCFGPass(
+ SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true)));
FPM.addPass(InstCombinePass());
invokePeepholeEPCallbacks(FPM, Level);
@@ -1322,8 +1323,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
- // FIXME: study whether hoisting and/or sinking of common instructions should
- // be delayed until after SLP vectorizer.
OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index c18e868e7a46d..1e752462da819 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -509,7 +509,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
if (RerollLoops)
MPM.add(createLoopRerollPass());
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Merge & remove BBs and sink & hoist common instructions.
+ MPM.add(createCFGSimplificationPass(
+ SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true)));
// Clean up after everything.
MPM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, MPM);
@@ -823,8 +825,6 @@ void PassManagerBuilder::populateModulePassManager(
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
- // FIXME: study whether hoisting and/or sinking of common instructions should
- // be delayed until after SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll
index ed1765f00b3a6..80edc4ee954e7 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll
+++ b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll
@@ -11,9 +11,12 @@ entry:
br i1 %tobool, label %if.else, label %if.then, !prof !30
if.then:
+ ; The calls here ensure that the instructions are not hoisted by SimplifyCFG.
+ call void @clobber()
%0 = load i32, i32* @odd, align 4
%inc = add i32 %0, 1
store i32 %inc, i32* @odd, align 4
+ call void @clobber()
br label %if.end
if.else:
@@ -26,6 +29,8 @@ if.end:
ret void
}
+declare void @clobber()
+
define internal fastcc i32 @cond(i32 %i) #1 !prof !29 !PGOFuncName !35 {
entry:
%rem = srem i32 %i, 2
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
index 8a21b1bef86c1..4b5af587109de 100644
--- a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
+++ b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
@@ -103,10 +103,10 @@ for.end:
ret void
}
; CSPGOSUMMARY-LABEL: @foo
-; CSPGOSUMMARY: %even.sink{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd
-; CSPGOSUMMARY-SAME: !prof ![[BW1_CSPGO_FOO:[0-9]+]]
-; CSPGOSUMMARY: %even.sink{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd
-; CSPGOSUMMARY-SAME: !prof ![[BW2_CSPGO_FOO:[0-9]+]]
+; CSPGOSUMMARY: %odd.sink.i{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd
+; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]]
+; CSPGOSUMMARY: %odd.sink.i{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd
+; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]]
declare dso_local i32 @bar_m(i32)
declare dso_local i32 @bar_m2(i32)
@@ -152,5 +152,3 @@ entry:
; CSPGOSUMMARY: {{![0-9]+}} = !{!"MaxFunctionCount", i64 200000}
; CSPGOSUMMARY: {{![0-9]+}} = !{!"NumCounts", i64 23}
; CSPGOSUMMARY-DAG: ![[BW_CSPGO_BAR]] = !{!"branch_weights", i32 100000, i32 100000}
-; CSPGOSUMMARY-DAG: ![[BW1_CSPGO_FOO]] = !{!"branch_weights", i32 100000, i32 0}
-; CSPGOSUMMARY-DAG: ![[BW2_CSPGO_FOO]] = !{!"branch_weights", i32 0, i32 100000}
diff --git a/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll b/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll
index 357115dc6501c..6d35946a28ff3 100644
--- a/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll
+++ b/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll
@@ -8,6 +8,7 @@
; RUN: -r=%t1.bc,bar,l \
; RUN: -r=%t1.bc,main,plx \
; RUN: -r=%t2.bc,bar,pl \
+; RUN: -r=%t2.bc,clobber,pl \
; RUN: -r=%t2.bc,odd,pl \
; RUN: -r=%t2.bc,even,pl
; RUN: llvm-dis %t.1.4.opt.bc -o - | FileCheck %s --check-prefix=CSUSE
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index d23571c73d88a..c726b62260b66 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -140,16 +140,61 @@ for.end: ; preds = %for.cond.cleanup
define void @loop2(float* %A, float* %B, i32* %C, float %x) {
; CHECK-LABEL: @loop2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP_BODY:%.*]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 10000
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP6]] to float*
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[C]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND011:%.*]] = icmp ugt float* [[SCEVGEP9]], [[B]]
+; CHECK-NEXT: [[BOUND112:%.*]] = icmp ugt float* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT13]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[LOOP_BODY:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[DOT0:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 0
+; CHECK-NEXT: [[DOT017:%.*]] = getelementptr inbounds float, float* [[A]], i64 0
+; CHECK-NEXT: [[DOT018:%.*]] = getelementptr inbounds float, float* [[B]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT_0:%.*]] = add i64 0, 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX_NEXT_PHI:%.*]] = phi i64 [ [[INDEX_NEXT_0]], [[VECTOR_PH]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE:%.*]] ]
+; CHECK-NEXT: [[DOTPHI:%.*]] = phi float* [ [[DOT018]], [[VECTOR_PH]] ], [ [[DOT120:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float* [ [[DOT017]], [[VECTOR_PH]] ], [ [[DOT119:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[DOTPHI22:%.*]] = phi i32* [ [[DOT0]], [[VECTOR_PH]] ], [ [[DOT1:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTPHI22]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 20, i32 20, i32 20, i32 20>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[DOTPHI21]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[WIDE_LOAD14]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[DOTPHI]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !13, !noalias !15
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP5]], [[WIDE_LOAD15]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP5]], <4 x float> [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DOTPHI]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP8]], align 4, !alias.scope !13, !noalias !15
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT_PHI]], 10000
+; CHECK-NEXT: br i1 [[TMP9]], label [[EXIT:%.*]], label [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: vector.body.vector.body_crit_edge:
+; CHECK-NEXT: [[DOT1]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX_NEXT_PHI]]
+; CHECK-NEXT: [[DOT119]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_PHI]]
+; CHECK-NEXT: [[DOT120]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX_NEXT_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT_1]] = add i64 [[INDEX_NEXT_PHI]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY]]
; CHECK: loop.body:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IV1]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[IV1]]
; CHECK-NEXT: [[C_LV:%.*]] = load i32, i32* [[C_GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C_LV]], 20
-; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV1]]
+; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV1]]
; CHECK-NEXT: [[A_LV_0:%.*]] = load float, float* [[A_GEP_0]], align 4
-; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X:%.*]]
-; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV1]]
+; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X]]
+; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV1]]
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[ELSE:%.*]]
; CHECK: else:
; CHECK-NEXT: [[B_LV:%.*]] = load float, float* [[B_GEP_0]], align 4
@@ -160,7 +205,7 @@ define void @loop2(float* %A, float* %B, i32* %C, float %x) {
; CHECK-NEXT: store float [[ADD_SINK]], float* [[B_GEP_0]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999
-; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT:%.*]]
+; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list