[PATCH] D101115: [AMDGPU] Move code sinking before structurizer

Thu Apr 22 15:23:49 PDT 2021

piotr created this revision.
Herald added subscribers: kerbowa, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl, arsenm.
piotr requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

Moving code sinking pass before structurizer creates more sinking
opportunities.

The extra flow edges introduced by the structurizer can have adverse
effects on sinking, because the sinking pass prefers moving instructions
to blocks with unique predecessors and the structurizer destroys that
property in some cases.

A notable example is moving high-latency image instructions across kills.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D101115

Files:
  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
  llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
  llvm/test/CodeGen/AMDGPU/multilevel-break.ll


Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -73,17 +73,16 @@
 ; GCN-NEXT:  BB0_4: ; %LOOP
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v1
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v4
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_cbranch_execz BB0_3
 ; GCN-NEXT:  ; %bb.5: ; %ENDIF
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -196,7 +195,7 @@
 ; GCN-NEXT:    s_cbranch_execz BB1_9
 ; GCN-NEXT:  BB1_2: ; %bb1
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; GCN-NEXT:    s_mov_b64 s[6:7], -1
@@ -213,7 +212,7 @@
 ; GCN-NEXT:    s_cbranch_vccz BB1_5
 ; GCN-NEXT:  ; %bb.4: ; %case1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v2
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
@@ -233,7 +232,7 @@
 ; GCN-NEXT:    s_cbranch_vccz BB1_1
 ; GCN-NEXT:  ; %bb.8: ; %case0
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
Index: llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -6,11 +6,11 @@
 ; with exec.
 
 ; GCN-LABEL: {{^}}needs_and:
-; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
-; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
 
+; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]]
+; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1:[^ ,]*]], [[REG2:[^ ,]*]]
+; GCN: s_or_b64 exec, exec, [[REG2:[^ ,]*]]
 define void @needs_and(i32 %arg) {
 entry:
   br label %loop
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1039,6 +1039,7 @@
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
+  addPass(createSinkingPass());
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
   addPass(&AMDGPUUnifyDivergentExitNodesID);
@@ -1049,7 +1050,6 @@
     }
     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
   }
-  addPass(createSinkingPass());
   addPass(createAMDGPUAnnotateUniformValues());
   if (!LateCFGStructurize) {
     addPass(createSIAnnotateControlFlowPass());


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D101115.339794.patch
Type: text/x-patch
Size: 3759 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210422/9e335914/attachment.bin>