[llvm] be3f459 - AMDGPU: Mark control flow intrinsics non-duplicable
Ruiling Song via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 5 23:34:42 PST 2023
Author: Ruiling Song
Date: 2023-02-06T15:32:44+08:00
New Revision: be3f4591aff07204bdc0f6eda0ea70de44c08af6
URL: https://github.com/llvm/llvm-project/commit/be3f4591aff07204bdc0f6eda0ea70de44c08af6
DIFF: https://github.com/llvm/llvm-project/commit/be3f4591aff07204bdc0f6eda0ea70de44c08af6.diff
LOG: AMDGPU: Mark control flow intrinsics non-duplicable
This helps produce a simplified CFG for divergent regions and also
yields better code generation in some cases.
For example, given the IR below:
```
define amdgpu_kernel void @test() {
bb:
br label %bb1
bb1:
%tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb4 ]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cnd = icmp eq i32 %tid, 0
br i1 %cnd, label %bb4, label %bb2
bb2:
%tmp3 = add nsw i32 %tmp, 1
br label %bb4
bb4:
%tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp, %bb1 ]
store volatile i32 %tmp5, ptr addrspace(1) undef
br label %bb1
}
```
Before the change, we got the following assembly:
```
v_mov_b32_e32 v1, 0
v_cmp_eq_u32_e32 vcc, 0, v0
s_branch .LBB0_2
.LBB0_1: ; %bb4
; in Loop: Header=BB0_2 Depth=1
s_mov_b32 s2, -1
s_mov_b32 s3, 0xf000
buffer_store_dword v1, off, s[0:3], 0
s_waitcnt vmcnt(0)
.LBB0_2: ; %bb
; =>This Inner Loop Header: Depth=1
s_and_saveexec_b64 s[0:1], vcc
s_xor_b64 s[0:1], exec, s[0:1]
; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 killed $exec
s_cbranch_execnz .LBB0_1
; %bb.3: ; %bb2
; in Loop: Header=BB0_2 Depth=1
s_or_b64 exec, exec, s[0:1]
s_waitcnt expcnt(0)
v_add_i32_e64 v1, s[0:1], 1, v1
s_branch .LBB0_1
```
After the change:
```
s_mov_b32 s0, 0
v_cmp_eq_u32_e32 vcc, 0, v0
s_mov_b32 s2, -1
s_mov_b32 s3, 0xf000
v_mov_b32_e32 v0, s0
s_branch .LBB0_2
.LBB0_1: ; %bb4
; in Loop: Header=BB0_2 Depth=1
buffer_store_dword v0, off, s[0:3], 0
s_waitcnt vmcnt(0)
.LBB0_2: ; %bb1
; =>This Inner Loop Header: Depth=1
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execnz .LBB0_1
; %bb.3: ; %bb2
; in Loop: Header=BB0_2 Depth=1
s_or_b64 exec, exec, s[0:1]
s_waitcnt expcnt(0)
v_add_i32_e64 v0, s[0:1], 1, v0
s_branch .LBB0_1
```
After the change we use one fewer VGPR and one fewer s_xor_, and get
better LICM, with one additional branch. Please note that the experiment
was done with the workaround D139780 reverted, because that workaround
would otherwise stop tail-duplication completely for this case.
Reviewed by: arsenm
Differential Revision: https://reviews.llvm.org/D118250
Added:
llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 19f2f27120f6b..2c10cdc7f5af5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -368,7 +368,13 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let isTerminator = 1 in {
+// As we have enhanced control flow intrinsics to work under unstructured CFG,
+// duplicating such intrinsics can be actually treated as legal. On the contrary,
+// by making them non-duplicable, we are observing better code generation result.
+// So we choose to mark them non-duplicable in hope of getting better code
+// generation as well as simplified CFG during Machine IR optimization stage.
+
+let isTerminator = 1, isNotDuplicable = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
@@ -418,6 +424,7 @@ def SI_END_CF : CFPseudoInstSI <
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
let hasSideEffects = 1;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let mayLoad = 1; // FIXME: Should not need memory flags
let mayStore = 1;
}
@@ -425,6 +432,7 @@ def SI_END_CF : CFPseudoInstSI <
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
index 7666c8f9a07dc..a21cf8290f5a2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
@@ -18,12 +18,12 @@ define amdgpu_ps void @main(i32 %arg) {
; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execz .LBB0_5
; GFX10-NEXT: .LBB0_2: ; %bb4
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_and_saveexec_b32 s3, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_1
; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
new file mode 100644
index 0000000000000..c23c8900096fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: stop_duplicate_cfg_intrinsic
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: stop_duplicate_cfg_intrinsic
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[PHI]], [[COPY2]], 0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_CO_U32_e64_]], %bb.2, [[PHI]], %bb.1
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET [[PHI1]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ bb.1:
+ liveins: $vgpr0
+
+ %0:vgpr_32 = COPY $vgpr0
+ %12:sreg_64 = IMPLICIT_DEF
+ %4:sreg_32 = S_MOV_B32 0
+ %14:vgpr_32 = COPY %4:sreg_32
+ %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0:vgpr_32, %14:vgpr_32, implicit $exec
+
+ bb.2:
+ %6:vgpr_32 = PHI %4:sreg_32, %bb.1, %11:vgpr_32, %bb.4
+ %8:sreg_64_xexec = SI_IF %5:sreg_64_xexec, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ SI_END_CF %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec
+ %13:sreg_32 = S_MOV_B32 1
+ %15:vgpr_32 = COPY %13:sreg_32
+ %10:vgpr_32, dead %20:sreg_64_xexec = V_ADD_CO_U32_e64 %6:vgpr_32, %15:vgpr_32, 0, implicit $exec
+
+ bb.4:
+ %11:vgpr_32 = PHI %10:vgpr_32, %bb.3, %6:vgpr_32, %bb.2
+ %16:sreg_32 = S_MOV_B32 4294967295
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_64 = REG_SEQUENCE %16:sreg_32, %subreg.sub0, %17:sreg_32, %subreg.sub1
+ %19:sgpr_128 = REG_SEQUENCE %12:sreg_64, %subreg.sub0_sub1, %18:sreg_64, %subreg.sub2_sub3
+ BUFFER_STORE_DWORD_OFFSET %11:vgpr_32, %19:sgpr_128, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+...
More information about the llvm-commits
mailing list