[llvm] 09fe84a - [AMDGPU] Move code sinking before structurizer

Tue May 11 05:34:46 PDT 2021

Author: Piotr Sobczak
Date: 2021-05-11T14:07:23+02:00
New Revision: 09fe84abb4ee71f707c3ec8e960a42d8292f6211

URL: https://github.com/llvm/llvm-project/commit/09fe84abb4ee71f707c3ec8e960a42d8292f6211
DIFF: https://github.com/llvm/llvm-project/commit/09fe84abb4ee71f707c3ec8e960a42d8292f6211.diff

LOG: [AMDGPU] Move code sinking before structurizer

Moving the code sinking pass before the structurizer creates more sinking
opportunities.

The extra flow edges introduced by the structurizer can have adverse
effects on sinking, because the sinking pass prefers moving instructions
to blocks with unique predecessors and the structurizer destroys that
property in some cases.

A notable example is moving high-latency image instructions across kills.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D101115

Added: 
    llvm/test/CodeGen/AMDGPU/sink-image-sample.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
    llvm/test/CodeGen/AMDGPU/multilevel-break.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3db6a16338ad7..8d702e4429091 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1041,6 +1041,7 @@ bool GCNPassConfig::addPreISel() {
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
+  addPass(createSinkingPass());
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
   addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -1051,7 +1052,6 @@ bool GCNPassConfig::addPreISel() {
     }
     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
   }
-  addPass(createSinkingPass());
   addPass(createAMDGPUAnnotateUniformValues());
   if (!LateCFGStructurize) {
     addPass(createSIAnnotateControlFlowPass());

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index b7804d75f1441..2ae152cae1576 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -77,6 +77,10 @@
 ; GCN-O0-NEXT:       Natural Loop Information
 ; GCN-O0-NEXT:       Legacy Divergence Analysis
 ; GCN-O0-NEXT:       AMDGPU IR late optimizations
+; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O0-NEXT:       Function Alias Analysis Results
+; GCN-O0-NEXT:       Code sinking
+; GCN-O0-NEXT:       Legacy Divergence Analysis
 ; GCN-O0-NEXT:       Unify divergent function exit nodes
 ; GCN-O0-NEXT:       Lazy Value Information Analysis
 ; GCN-O0-NEXT:       Lower SwitchInst's to branches
@@ -89,12 +93,10 @@
 ; GCN-O0-NEXT:       Detect single entry single exit regions
 ; GCN-O0-NEXT:       Region Pass Manager
 ; GCN-O0-NEXT:         Structurize control flow
-; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O0-NEXT:       Function Alias Analysis Results
-; GCN-O0-NEXT:       Natural Loop Information
-; GCN-O0-NEXT:       Code sinking
 ; GCN-O0-NEXT:       Post-Dominator Tree Construction
+; GCN-O0-NEXT:       Natural Loop Information
 ; GCN-O0-NEXT:       Legacy Divergence Analysis
+; GCN-O0-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O0-NEXT:       Function Alias Analysis Results
 ; GCN-O0-NEXT:       Memory SSA
 ; GCN-O0-NEXT:       AMDGPU Annotate Uniform Values
@@ -256,6 +258,10 @@
 ; GCN-O1-NEXT:       Natural Loop Information
 ; GCN-O1-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-NEXT:       AMDGPU IR late optimizations
+; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O1-NEXT:       Function Alias Analysis Results
+; GCN-O1-NEXT:       Code sinking
+; GCN-O1-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-NEXT:       Unify divergent function exit nodes
 ; GCN-O1-NEXT:       Lazy Value Information Analysis
 ; GCN-O1-NEXT:       Lower SwitchInst's to branches
@@ -268,12 +274,10 @@
 ; GCN-O1-NEXT:       Detect single entry single exit regions
 ; GCN-O1-NEXT:       Region Pass Manager
 ; GCN-O1-NEXT:         Structurize control flow
-; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:       Function Alias Analysis Results
-; GCN-O1-NEXT:       Natural Loop Information
-; GCN-O1-NEXT:       Code sinking
 ; GCN-O1-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-NEXT:       Natural Loop Information
 ; GCN-O1-NEXT:       Legacy Divergence Analysis
+; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:       Function Alias Analysis Results
 ; GCN-O1-NEXT:       Memory SSA
 ; GCN-O1-NEXT:       AMDGPU Annotate Uniform Values
@@ -530,6 +534,10 @@
 ; GCN-O1-OPTS-NEXT:       Natural Loop Information
 ; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-OPTS-NEXT:       AMDGPU IR late optimizations
+; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
+; GCN-O1-OPTS-NEXT:       Code sinking
+; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
 ; GCN-O1-OPTS-NEXT:       Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:       Lazy Value Information Analysis
 ; GCN-O1-OPTS-NEXT:       Lower SwitchInst's to branches
@@ -542,12 +550,10 @@
 ; GCN-O1-OPTS-NEXT:       Detect single entry single exit regions
 ; GCN-O1-OPTS-NEXT:       Region Pass Manager
 ; GCN-O1-OPTS-NEXT:         Structurize control flow
-; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT:       Natural Loop Information
-; GCN-O1-OPTS-NEXT:       Code sinking
 ; GCN-O1-OPTS-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:       Natural Loop Information
 ; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
+; GCN-O1-OPTS-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:       Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:       Memory SSA
 ; GCN-O1-OPTS-NEXT:       AMDGPU Annotate Uniform Values
@@ -812,6 +818,10 @@
 ; GCN-O2-NEXT:       Natural Loop Information
 ; GCN-O2-NEXT:       Legacy Divergence Analysis
 ; GCN-O2-NEXT:       AMDGPU IR late optimizations
+; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O2-NEXT:       Function Alias Analysis Results
+; GCN-O2-NEXT:       Code sinking
+; GCN-O2-NEXT:       Legacy Divergence Analysis
 ; GCN-O2-NEXT:       Unify divergent function exit nodes
 ; GCN-O2-NEXT:       Lazy Value Information Analysis
 ; GCN-O2-NEXT:       Lower SwitchInst's to branches
@@ -824,12 +834,10 @@
 ; GCN-O2-NEXT:       Detect single entry single exit regions
 ; GCN-O2-NEXT:       Region Pass Manager
 ; GCN-O2-NEXT:         Structurize control flow
-; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O2-NEXT:       Function Alias Analysis Results
-; GCN-O2-NEXT:       Natural Loop Information
-; GCN-O2-NEXT:       Code sinking
 ; GCN-O2-NEXT:       Post-Dominator Tree Construction
+; GCN-O2-NEXT:       Natural Loop Information
 ; GCN-O2-NEXT:       Legacy Divergence Analysis
+; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Memory SSA
 ; GCN-O2-NEXT:       AMDGPU Annotate Uniform Values
@@ -1107,6 +1115,10 @@
 ; GCN-O3-NEXT:       Natural Loop Information
 ; GCN-O3-NEXT:       Legacy Divergence Analysis
 ; GCN-O3-NEXT:       AMDGPU IR late optimizations
+; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
+; GCN-O3-NEXT:       Function Alias Analysis Results
+; GCN-O3-NEXT:       Code sinking
+; GCN-O3-NEXT:       Legacy Divergence Analysis
 ; GCN-O3-NEXT:       Unify divergent function exit nodes
 ; GCN-O3-NEXT:       Lazy Value Information Analysis
 ; GCN-O3-NEXT:       Lower SwitchInst's to branches
@@ -1119,12 +1131,10 @@
 ; GCN-O3-NEXT:       Detect single entry single exit regions
 ; GCN-O3-NEXT:       Region Pass Manager
 ; GCN-O3-NEXT:         Structurize control flow
-; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
-; GCN-O3-NEXT:       Function Alias Analysis Results
-; GCN-O3-NEXT:       Natural Loop Information
-; GCN-O3-NEXT:       Code sinking
 ; GCN-O3-NEXT:       Post-Dominator Tree Construction
+; GCN-O3-NEXT:       Natural Loop Information
 ; GCN-O3-NEXT:       Legacy Divergence Analysis
+; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Memory SSA
 ; GCN-O3-NEXT:       AMDGPU Annotate Uniform Values

diff  --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index e60f0ddf7578a..9ff56e1d1f85f 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -6,11 +6,11 @@
 ; with exec.
 
 ; GCN-LABEL: {{^}}needs_and:
-; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
-; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
 
+; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]]
+; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1:[^ ,]*]], [[REG2:[^ ,]*]]
+; GCN: s_or_b64 exec, exec, [[REG2:[^ ,]*]]
 define void @needs_and(i32 %arg) {
 entry:
   br label %loop

diff  --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 0b3b1e57072d1..31897a7e13ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -73,17 +73,16 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; GCN-NEXT:  BB0_4: ; %LOOP
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_cbranch_execz BB0_3
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]

diff  --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
new file mode 100644
index 0000000000000..e1273e1a4bcd0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; Test that image.sample instruction is sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image.
+
+; GCN-LABEL: {{^}}sinking_img_sample:
+; GCN-NOT: image_sample
+; GCN: branch
+; GCN: image_sample
+; GCN: exp null
+
+define amdgpu_ps float @sinking_img_sample() {
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
+  br i1 undef, label %endif1, label %if1
+
+if1:                                              ; preds = %main_body
+  call void @llvm.amdgcn.kill(i1 false) #4
+  br label %exit
+
+endif1:                                           ; preds = %main_body
+  %i22 = extractelement <3 x float> %i, i32 2
+  %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1
+  br label %exit
+
+exit:                                             ; preds = %endif1, %if1
+  %i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ]
+  ret float %i24
+}
+; Function Attrs: nounwind readonly willreturn
+declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.fma.f32(float, float, float) #2
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.kill(i1) #4
+
+attributes #1 = { nounwind readnone }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readonly willreturn }
+attributes #4 = { nounwind }