[llvm] f269ec2 - [LoopFlatten] Move it from LPM2 to LPM1
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 19 06:38:34 PST 2022
Author: Sjoerd Meijer
Date: 2022-01-19T14:38:05Z
New Revision: f269ec230e2b65df165009b85f1e8cae2aeb1bba
URL: https://github.com/llvm/llvm-project/commit/f269ec230e2b65df165009b85f1e8cae2aeb1bba
DIFF: https://github.com/llvm/llvm-project/commit/f269ec230e2b65df165009b85f1e8cae2aeb1bba.diff
LOG: [LoopFlatten] Move it from LPM2 to LPM1
In D110057 we moved LoopFlatten to a LoopPassManager. This caused a performance
regression for our 64-bit targets (the 32-bit targets were unaffected): the pass
is no longer triggering for a motivating example. The reason is that the IR is
very different from what LoopFlatten expects; we try to match loop statements
and particular uses of induction variables. The easiest fix is to move
LoopFlatten to a place in the pipeline where the IR is as expected, which is
just before IndVarSimplify. This means we move it from LPM2 to LPM1, so that it
runs slightly earlier than it did before. IndVarSimplify is responsible for
significant rewrites that are difficult to "look through" in LoopFlatten.
Differential Revision: https://reviews.llvm.org/D116612
Added:
llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
Modified:
llvm/lib/Passes/PassBuilderPipelines.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 79c42b886fbbe..6110bda02406d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -300,6 +300,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
// TODO: Investigate promotion cap for O1.
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
LPM1.addPass(SimpleLoopUnswitchPass());
+ if (EnableLoopFlatten)
+ LPM1.addPass(LoopFlattenPass());
LPM2.addPass(LoopIdiomRecognizePass());
LPM2.addPass(IndVarSimplifyPass());
@@ -311,8 +313,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
if (EnableLoopInterchange)
LPM2.addPass(LoopInterchangePass());
- if (EnableLoopFlatten)
- LPM2.addPass(LoopFlattenPass());
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
// because it changes IR to makes profile annotation in back compile
@@ -475,6 +475,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
LPM1.addPass(
SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
EnableO3NonTrivialUnswitching));
+ if (EnableLoopFlatten)
+ LPM1.addPass(LoopFlattenPass());
+
LPM2.addPass(LoopIdiomRecognizePass());
LPM2.addPass(IndVarSimplifyPass());
@@ -485,8 +488,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (EnableLoopInterchange)
LPM2.addPass(LoopInterchangePass());
- if (EnableLoopFlatten)
- LPM2.addPass(LoopFlattenPass());
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
// because it changes IR to makes profile annotation in back compile
@@ -1628,10 +1629,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MainFPM.addPass(ConstraintEliminationPass());
LoopPassManager LPM;
- LPM.addPass(IndVarSimplifyPass());
- LPM.addPass(LoopDeletionPass());
if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
LPM.addPass(LoopFlattenPass());
+ LPM.addPass(IndVarSimplifyPass());
+ LPM.addPass(LoopDeletionPass());
// FIXME: Add loop interchange.
// Unroll small loops and perform peeling.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
new file mode 100644
index 0000000000000..5ef5bb185ab02
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+define dso_local void @_Z3fooPiii(i32* %A, i32 %N, i32 %M) #0 {
+; CHECK-LABEL: @_Z3fooPiii(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP3]], i1 [[CMP21]], i1 false
+; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.lr.ph.split.us:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[M]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw nsw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK: for.cond1.preheader.us:
+; CHECK-NEXT: [[INDVAR6:%.*]] = phi i64 [ [[INDVAR_NEXT7:%.*]], [[FOR_COND1_PREHEADER_US]] ], [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR6]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: tail call void @_Z1fi(i32 [[TMP2]])
+; CHECK-NEXT: [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond:
+ %i.0 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+ %cmp = icmp slt i32 %i.0, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ br label %for.cond1
+
+for.cond1:
+ %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ]
+ %cmp2 = icmp slt i32 %j.0, %M
+ br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+ %inc6 = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.body4:
+ %mul = mul nsw i32 %i.0, %M
+ %add = add nsw i32 %mul, %j.0
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ call void @_Z1fi(i32 %0)
+ %inc = add nsw i32 %j.0, 1
+ br label %for.cond1
+}
+
+declare dso_local void @_Z1fi(i32) #2
More information about the llvm-commits
mailing list