[llvm] e0e7371 - [AMDGPU] Keep skip branch for ds instructions
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 5 03:56:08 PST 2021
Author: Sebastian Neubauer
Date: 2021-03-05T12:34:09+01:00
New Revision: e0e73714fbfeacad1a3adc0c6c8dbedcebc53ebc
URL: https://github.com/llvm/llvm-project/commit/e0e73714fbfeacad1a3adc0c6c8dbedcebc53ebc
DIFF: https://github.com/llvm/llvm-project/commit/e0e73714fbfeacad1a3adc0c6c8dbedcebc53ebc.diff
LOG: [AMDGPU] Keep skip branch for ds instructions
Same as other memory instructions, ds instructions add latency even if
exec is zero. Jumping over them if exec=0 is cheaper than executing
them.
With this change, the branch instruction that skips over a basic block
if exec=0 is not removed when the block contains a ds instruction.
Differential Revision: https://reviews.llvm.org/D97922
Added:
llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
Modified:
llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
llvm/test/CodeGen/AMDGPU/ret_jump.ll
llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
Removed:
llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
index d30ff4a3fd15..104dea8fdff5 100644
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -100,7 +100,7 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
+ TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
return true;
++NumInstr;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index 660ef84dd892..8b192f5f5ca3 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -14,11 +14,12 @@
; GCN-DAG: v_cmp_lt_f32_e32 vcc,
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
+; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %bb4
; GCN: ds_write_b32
-; GCN: ; %bb.{{[0-9]+}}:
+; GCN: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
similarity index 71%
rename from llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
rename to llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
index 3c078d847e3c..c0968b2437ba 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
@@ -56,3 +56,31 @@ body: |
bb.2:
S_ENDPGM 0
...
+
+---
+
+name: skip_execz_ds
+body: |
+ ; CHECK-LABEL: name: skip_execz_ds
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
+ ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; CHECK: bb.2:
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ SI_MASK_BRANCH %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec
+
+ bb.2:
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index 06d2c3be303e..8da4bd30fead 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -65,6 +65,7 @@ ret.bb: ; preds = %else, %main_body
; GCN: BB{{[0-9]+_[0-9]+}}: ; %else
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
; GCN-NEXT: ; %unreachable.bb
; GCN: ds_write_b32
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index dd34674c699c..11b4feb5cca2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -3,12 +3,13 @@
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
+; GCN-NEXT: s_cbranch_execz BB0_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
-; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: BB0_{{[0-9]+}}: ; %UnifiedReturnBlock
; GCN: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
@@ -28,12 +29,13 @@ ret:
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64
+; GCN-NEXT: s_cbranch_execz BB1_{{[0-9]+}}
; GCN-NEXT: ; %bb.{{[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
-; GCN: ; %bb.{{[0-9]+}}:
+; GCN: BB1_{{[0-9]+}}:
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb:
More information about the llvm-commits
mailing list