[clang] 03bd519 - [OldPM] Pass manager: run SROA after (simple) loop unrolling

Roman Lebedev via cfe-commits cfe-commits at lists.llvm.org
Sun Oct 4 01:54:14 PDT 2020


Author: Roman Lebedev
Date: 2020-10-04T11:53:50+03:00
New Revision: 03bd5198b6f7d9f49d72e6516d813a206f3b6d0d

URL: https://github.com/llvm/llvm-project/commit/03bd5198b6f7d9f49d72e6516d813a206f3b6d0d
DIFF: https://github.com/llvm/llvm-project/commit/03bd5198b6f7d9f49d72e6516d813a206f3b6d0d.diff

LOG: [OldPM] Pass manager: run SROA after (simple) loop unrolling

I have stumbled into this pretty accidentally, when rewriting
some spaghetti-like code into something more structured,
which involved using some `std::array<>`s. And to my surprise,
the `alloca`s remained, causing about `+160%` perf regression.

https://llvm-compile-time-tracker.com/compare.php?from=bb6f4d32aac3eecb51909f4facc625219307ee68&to=d563e66f40f9d4d145cb2050e41cb961e2b37785&stat=instructions
suggests that this has geomean compile-time cost of `+0.08%`.

Note that D68593 / cecc0d27ad58c0aed8ef9ed99bbf691e137a0f26
already did this change for NewPM, but left OldPM in a pessimized state.

This fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40011 | PR40011 ]], [[ https://bugs.llvm.org/show_bug.cgi?id=42794 | PR42794 ]] and probably some other reports.

Reviewed By: nikic, xbolva00

Differential Revision: https://reviews.llvm.org/D87972

Added: 
    

Modified: 
    clang/test/CodeGenCXX/union-tbaa2.cpp
    clang/test/Misc/loop-opt-setup.c
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/test/Other/opt-O2-pipeline.ll
    llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
    llvm/test/Other/opt-O3-pipeline.ll
    llvm/test/Other/opt-Os-pipeline.ll
    llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGenCXX/union-tbaa2.cpp b/clang/test/CodeGenCXX/union-tbaa2.cpp
index 5d13ff1ad8d9..65872d4a98ae 100644
--- a/clang/test/CodeGenCXX/union-tbaa2.cpp
+++ b/clang/test/CodeGenCXX/union-tbaa2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -O2 -fno-experimental-new-pass-manager -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -O1 -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s
 
 // Testcase from llvm.org/PR32056
 

diff  --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c
index 868c716c6ed7..322f5e0e6d4a 100644
--- a/clang/test/Misc/loop-opt-setup.c
+++ b/clang/test/Misc/loop-opt-setup.c
@@ -1,5 +1,5 @@
-// RUN: %clang -O1 -fexperimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-NEWPM
-// RUN: %clang -O1 -fno-experimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-OLDPM
+// RUN: %clang -O1 -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s
+
 extern int a[16];
 int b = 0;
 int foo(void) {
@@ -9,10 +9,8 @@ int foo(void) {
   return b;
 }
 // Check br i1 to make sure that the loop is fully unrolled
-// CHECK-LABEL-NEWPM: foo
-// CHECK-NOT-NEWPM: br i1
-// CHECK-LABEL-OLDPM: foo
-// CHECK-NOT-OLDPM: br i1
+// CHECK-LABEL: foo
+// CHECK-NOT: br i1
 
 void Helper() {
   const int *nodes[5];
@@ -26,17 +24,7 @@ void Helper() {
 }
 
 // Check br i1 to make sure the loop is gone, there will still be a label branch for the infinite loop.
-// CHECK-LABEL-NEWPM: Helper
-// CHECK-NEWPM: br label
-// CHECK-NEWPM-NOT: br i1
-// CHECK-NEWPM: br label
-
-// The old pass manager doesn't remove the while loop so check for 5 load i32*.
-// CHECK-LABEL-OLDPM: Helper
-// CHECK-OLDPM: br label
-// CHECK-OLDPM: load i32*
-// CHECK-OLDPM: load i32*
-// CHECK-OLDPM: load i32*
-// CHECK-OLDPM: load i32*
-// CHECK-OLDPM: load i32*
-// CHECK-OLDPM: ret
+// CHECK-LABEL: Helper
+// CHECK: br label
+// CHECK-NOT: br i1
+// CHECK: br label

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ccc493640b29..043effc97f2b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -479,14 +479,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
       if (EnableOpt)
         PM.add(createAMDGPUPromoteAllocaToVector());
   });
-
-  Builder.addExtension(
-      PassManagerBuilder::EP_LoopOptimizerEnd,
-      [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
-        // Add SROA after loop unrolling as more promotable patterns are
-        // exposed after small loops are fully unrolled.
-        PM.add(createSROAPass());
-      });
 }
 
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index c63705a4ee94..088f1e25f3d1 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -459,6 +459,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
   // This ends the loop pass pipelines.
 
+  // Break up allocas that may now be splittable after loop unrolling.
+  MPM.add(createSROAPass());
+
   if (OptLevel > 1) {
     MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
     MPM.add(NewGVN ? createNewGVNPass()

diff  --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
index 58ed6b2a0820..967477da22bd 100644
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -1,4 +1,4 @@
-; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s 
+; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s
 
 ; REQUIRES: asserts
 
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: Target Library Information
 ; CHECK-NEXT: Target Transform Information
 ;             Target Pass Configuration
-; CHECK:      Type-Based Alias Analysis 
+; CHECK:      Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Profile summary info
@@ -134,6 +134,8 @@
 ; CHECK-NEXT:           Recognize loop idioms
 ; CHECK-NEXT:           Delete dead loops
 ; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         MergedLoadStoreMotion
 ; CHECK-NEXT:         Phi Values Analysis
 ; CHECK-NEXT:         Function Alias Analysis Results

diff  --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
index 493957e865d4..3b8db87e8fb1 100644
--- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -139,6 +139,8 @@
 ; CHECK-NEXT:           Recognize loop idioms
 ; CHECK-NEXT:           Delete dead loops
 ; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         MergedLoadStoreMotion
 ; CHECK-NEXT:         Phi Values Analysis
 ; CHECK-NEXT:         Function Alias Analysis Results

diff  --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
index f674dabd5217..a53db61a93cf 100644
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -139,6 +139,8 @@
 ; CHECK-NEXT:           Recognize loop idioms
 ; CHECK-NEXT:           Delete dead loops
 ; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         MergedLoadStoreMotion
 ; CHECK-NEXT:         Phi Values Analysis
 ; CHECK-NEXT:         Function Alias Analysis Results

diff  --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
index 66df666a64c6..93c2d121255b 100644
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -120,6 +120,8 @@
 ; CHECK-NEXT:           Recognize loop idioms
 ; CHECK-NEXT:           Delete dead loops
 ; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         MergedLoadStoreMotion
 ; CHECK-NEXT:         Phi Values Analysis
 ; CHECK-NEXT:         Function Alias Analysis Results

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll
index 8c8a80cbf7ff..22694901162c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll
@@ -22,55 +22,21 @@ target triple = "x86_64-unknown-linux-gnu"
 %"struct.std::array" = type { [6 x i32] }
 
 define dso_local void @_Z3fooi(i32 %cnt) {
-; OLDPM-LABEL: @_Z3fooi(
-; OLDPM-NEXT:  entry:
-; OLDPM-NEXT:    [[ARR:%.*]] = alloca %"struct.std::array", align 16
-; OLDPM-NEXT:    [[TMP0:%.*]] = bitcast %"struct.std::array"* [[ARR]] to i8*
-; OLDPM-NEXT:    call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[TMP0]])
-; OLDPM-NEXT:    [[ARRAYDECAY_I_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 0
-; OLDPM-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 1
-; OLDPM-NEXT:    [[INCDEC_PTR_1:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 2
-; OLDPM-NEXT:    [[INCDEC_PTR_2:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 3
-; OLDPM-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CNT:%.*]], i32 0
-; OLDPM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer
-; OLDPM-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 2, i32 3, i32 4>
-; OLDPM-NEXT:    [[TMP4:%.*]] = bitcast %"struct.std::array"* [[ARR]] to <4 x i32>*
-; OLDPM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
-; OLDPM-NEXT:    [[INCDEC_PTR_3:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 4
-; OLDPM-NEXT:    [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
-; OLDPM-NEXT:    store i32 [[INC_4]], i32* [[INCDEC_PTR_3]], align 16
-; OLDPM-NEXT:    [[INCDEC_PTR_4:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 5
-; OLDPM-NEXT:    [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
-; OLDPM-NEXT:    store i32 [[INC_5]], i32* [[INCDEC_PTR_4]], align 4
-; OLDPM-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYDECAY_I_I_I]], align 16
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[TMP5]])
-; OLDPM-NEXT:    [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[TMP6]])
-; OLDPM-NEXT:    [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR_1]], align 8
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[TMP7]])
-; OLDPM-NEXT:    [[TMP8:%.*]] = load i32, i32* [[INCDEC_PTR_2]], align 4
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[TMP8]])
-; OLDPM-NEXT:    [[TMP9:%.*]] = load i32, i32* [[INCDEC_PTR_3]], align 16
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[TMP9]])
-; OLDPM-NEXT:    call void @_Z3usei(i32 [[INC_5]])
-; OLDPM-NEXT:    call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]])
-; OLDPM-NEXT:    ret void
-;
-; NEWPM-LABEL: @_Z3fooi(
-; NEWPM-NEXT:  entry:
-; NEWPM-NEXT:    [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1
-; NEWPM-NEXT:    [[INC_1:%.*]] = add nsw i32 [[CNT]], 2
-; NEWPM-NEXT:    [[INC_2:%.*]] = add nsw i32 [[CNT]], 3
-; NEWPM-NEXT:    [[INC_3:%.*]] = add nsw i32 [[CNT]], 4
-; NEWPM-NEXT:    [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
-; NEWPM-NEXT:    [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC]])
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC_1]])
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC_2]])
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC_3]])
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC_4]])
-; NEWPM-NEXT:    call void @_Z3usei(i32 [[INC_5]])
-; NEWPM-NEXT:    ret void
+; CHECK-LABEL: @_Z3fooi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1
+; CHECK-NEXT:    [[INC_1:%.*]] = add nsw i32 [[CNT]], 2
+; CHECK-NEXT:    [[INC_2:%.*]] = add nsw i32 [[CNT]], 3
+; CHECK-NEXT:    [[INC_3:%.*]] = add nsw i32 [[CNT]], 4
+; CHECK-NEXT:    [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
+; CHECK-NEXT:    [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC]])
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC_1]])
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC_2]])
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC_3]])
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC_4]])
+; CHECK-NEXT:    call void @_Z3usei(i32 [[INC_5]])
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cnt.addr = alloca i32


        


More information about the cfe-commits mailing list