[llvm] 503deec - Temporairly revert "[SimplifyCFG][LoopRotate] SimplifyCFG: disable common instruction hoisting by default, enable late in pipeline"
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 21 14:33:41 PDT 2020
Author: Roman Lebedev
Date: 2020-08-22T00:33:22+03:00
New Revision: 503deec2183d466dad64b763bab4e15fd8804239
URL: https://github.com/llvm/llvm-project/commit/503deec2183d466dad64b763bab4e15fd8804239
DIFF: https://github.com/llvm/llvm-project/commit/503deec2183d466dad64b763bab4e15fd8804239.diff
LOG: Temporairly revert "[SimplifyCFG][LoopRotate] SimplifyCFG: disable common instruction hoisting by default, enable late in pipeline"
As disscussed in post-commit review starting with
https://reviews.llvm.org/D84108#2227365
while this appears to be mostly a win overall, especially code-size-wise,
this appears to shake //certain// code pattens in a way that is extremely
unfavorable for performance (+30% runtime regression)
on certain CPU's (i personally can't reproduce).
So until the behaviour is better understood, and a path forward is mapped,
let's back this out for now.
This reverts commit 1d51dc38d89bd33fb8874e242ab87b265b4dec1c.
Added:
Modified:
llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
llvm/lib/Passes/PassBuilder.cpp
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
llvm/lib/Target/ARM/ARMTargetMachine.cpp
llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
llvm/test/Transforms/PGOProfile/chr.ll
llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index fb3a7490346f..46f6ca0462f8 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -25,7 +25,7 @@ struct SimplifyCFGOptions {
bool ForwardSwitchCondToPhi = false;
bool ConvertSwitchToLookupTable = false;
bool NeedCanonicalLoop = true;
- bool HoistCommonInsts = false;
+ bool HoistCommonInsts = true;
bool SinkCommonInsts = false;
bool SimplifyCondBranch = true;
bool FoldTwoEntryPHINode = true;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 010567f1c7cd..23288bb1ac07 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1147,14 +1147,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
- // FIXME: study whether hoisting and/or sinking of common instructions should
- // be delayed until after SLP vectorizer.
- OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
- .forwardSwitchCondToPhi(true)
- .convertSwitchToLookupTable(true)
- .needCanonicalLoops(false)
- .hoistCommonInsts(true)
- .sinkCommonInsts(true)));
+ OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions().
+ forwardSwitchCondToPhi(true).
+ convertSwitchToLookupTable(true).
+ needCanonicalLoops(false).
+ sinkCommonInsts(true)));
// Optimize parallel scalar instruction chains into SIMD instructions.
if (PTO.SLPVectorization)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 819f7b614106..40d71def6d09 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -457,7 +457,6 @@ void AArch64PassConfig::addIRPasses() {
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
- .hoistCommonInsts(true)
.sinkCommonInsts(true)));
// Run LoopDataPrefetch
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 87a106474c5e..b316b1041f2c 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -409,8 +409,7 @@ void ARMPassConfig::addIRPasses() {
// ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass(
- SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true),
- [this](const Function &F) {
+ SimplifyCFGOptions().sinkCommonInsts(true), [this](const Function &F) {
const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
}));
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 9bcdc89f2956..48b817d1341d 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -329,7 +329,6 @@ void HexagonPassConfig::addIRPasses() {
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
- .hoistCommonInsts(true)
.sinkCommonInsts(true)));
if (EnableLoopPrefetch)
addPass(createLoopDataPrefetchPass());
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 22daa8e812f6..c045c277706b 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -784,13 +784,10 @@ void PassManagerBuilder::populateModulePassManager(
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
- // FIXME: study whether hoisting and/or sinking of common instructions should
- // be delayed until after SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
- .hoistCommonInsts(true)
.sinkCommonInsts(true)));
if (SLPVectorize) {
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index b0435bf6e4ea..db5211df397a 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -63,8 +63,8 @@ static cl::opt<bool> UserForwardSwitchCond(
cl::desc("Forward switch condition to phi ops (default = false)"));
static cl::opt<bool> UserHoistCommonInsts(
- "hoist-common-insts", cl::Hidden, cl::init(false),
- cl::desc("hoist common instructions (default = false)"));
+ "hoist-common-insts", cl::Hidden, cl::init(true),
+ cl::desc("hoist common instructions (default = true)"));
static cl::opt<bool> UserSinkCommonInsts(
"sink-common-insts", cl::Hidden, cl::init(false),
diff --git a/llvm/test/Transforms/PGOProfile/chr.ll b/llvm/test/Transforms/PGOProfile/chr.ll
index 776501402b19..e2161d617022 100644
--- a/llvm/test/Transforms/PGOProfile/chr.ll
+++ b/llvm/test/Transforms/PGOProfile/chr.ll
@@ -2008,16 +2008,9 @@ define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
; CHECK-NEXT: bb0:
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
-; CHECK-NEXT: [[C1:%.*]] = icmp slt i64 [[V2]], 100
-; CHECK-NEXT: br i1 [[C1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
-; CHECK: bb0.split:
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
; CHECK-NEXT: ret i64 99
-; CHECK: bb0.split.nonchr:
-; CHECK-NEXT: [[V299_NONCHR:%.*]] = mul i64 [[V2]], 7860086430977039991
-; CHECK-NEXT: store i64 [[V299_NONCHR]], i64* [[J]], align 4
-; CHECK-NEXT: ret i64 99
;
bb0:
%v1 = add i64 %v0, 3
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
index 314af1c14145..1d8cce6879e9 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
@@ -5,11 +5,14 @@
; RUN: opt -O3 -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3
-; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK4
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK5
+; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK4
+; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK5
-; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK6
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK7
+; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK6
+; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK7
+
+; RUN: opt -O3 -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK8
+; RUN: opt -passes='default<O3>' -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK9
; This example is produced from a very basic C code:
;
@@ -58,8 +61,8 @@ define void @_Z4loopi(i32 %width) {
; HOIST-NEXT: br label [[FOR_COND:%.*]]
; HOIST: for.cond:
; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
-; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
; HOIST-NEXT: tail call void @f0()
+; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; HOIST: for.cond.cleanup:
; HOIST-NEXT: tail call void @f2()
@@ -77,17 +80,17 @@ define void @_Z4loopi(i32 %width) {
; ROTATED_LATER_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_OLDPM: for.cond.preheader:
; ROTATED_LATER_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
+; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATED_LATER_OLDPM: for.cond.cleanup:
-; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT: tail call void @f2()
; ROTATED_LATER_OLDPM-NEXT: br label [[RETURN]]
; ROTATED_LATER_OLDPM: for.body:
; ROTATED_LATER_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
-; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT: tail call void @f1()
; ROTATED_LATER_OLDPM-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
+; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_OLDPM: return:
@@ -99,19 +102,19 @@ define void @_Z4loopi(i32 %width) {
; ROTATED_LATER_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_NEWPM: for.cond.preheader:
; ROTATED_LATER_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
+; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]]
; ROTATED_LATER_NEWPM: for.cond.preheader.for.body_crit_edge:
; ROTATED_LATER_NEWPM-NEXT: [[INC_1:%.*]] = add nuw i32 0, 1
; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
; ROTATED_LATER_NEWPM: for.cond.cleanup:
-; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT: tail call void @f2()
; ROTATED_LATER_NEWPM-NEXT: br label [[RETURN]]
; ROTATED_LATER_NEWPM: for.body:
; ROTATED_LATER_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ]
-; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT: tail call void @f1()
+; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
; ROTATED_LATER_NEWPM: for.body.for.body_crit_edge:
@@ -126,19 +129,19 @@ define void @_Z4loopi(i32 %width) {
; ROTATE_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_OLDPM: for.cond.preheader:
; ROTATE_OLDPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
+; ROTATE_OLDPM-NEXT: tail call void @f0()
; ROTATE_OLDPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_OLDPM: for.body.preheader:
; ROTATE_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_OLDPM-NEXT: br label [[FOR_BODY:%.*]]
; ROTATE_OLDPM: for.cond.cleanup:
-; ROTATE_OLDPM-NEXT: tail call void @f0()
; ROTATE_OLDPM-NEXT: tail call void @f2()
; ROTATE_OLDPM-NEXT: br label [[RETURN]]
; ROTATE_OLDPM: for.body:
; ROTATE_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; ROTATE_OLDPM-NEXT: tail call void @f0()
; ROTATE_OLDPM-NEXT: tail call void @f1()
; ROTATE_OLDPM-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1
+; ROTATE_OLDPM-NEXT: tail call void @f0()
; ROTATE_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATE_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_OLDPM: return:
@@ -150,19 +153,19 @@ define void @_Z4loopi(i32 %width) {
; ROTATE_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_NEWPM: for.cond.preheader:
; ROTATE_NEWPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
+; ROTATE_NEWPM-NEXT: tail call void @f0()
; ROTATE_NEWPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_NEWPM: for.body.preheader:
; ROTATE_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_NEWPM-NEXT: [[INC_1:%.*]] = add nuw nsw i32 0, 1
; ROTATE_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
; ROTATE_NEWPM: for.cond.cleanup:
-; ROTATE_NEWPM-NEXT: tail call void @f0()
; ROTATE_NEWPM-NEXT: tail call void @f2()
; ROTATE_NEWPM-NEXT: br label [[RETURN]]
; ROTATE_NEWPM: for.body:
; ROTATE_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ]
-; ROTATE_NEWPM-NEXT: tail call void @f0()
; ROTATE_NEWPM-NEXT: tail call void @f1()
+; ROTATE_NEWPM-NEXT: tail call void @f0()
; ROTATE_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
; ROTATE_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
; ROTATE_NEWPM: for.body.for.body_crit_edge:
diff --git a/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll b/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
index 37cbc4640e41..b58017ba7ef0 100644
--- a/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
+++ b/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -simplifycfg -hoist-common-insts=1 -S < %s | FileCheck %s --check-prefixes=HOIST
; RUN: opt -simplifycfg -hoist-common-insts=0 -S < %s | FileCheck %s --check-prefixes=NOHOIST
-; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=NOHOIST,DEFAULT
+; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=HOIST,DEFAULT
; This example is produced from a very basic C code:
;
More information about the llvm-commits
mailing list