[llvm] remove redundant waitcnts (PR #75785)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 18 03:26:04 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
- [AMDGPU] Add new test case for #<!-- -->72830
- [AMDGPU] Promote all soft waitcnts at the end of SIInsertWaitcnts
- [AMDGPU] Do not promote soft waitcnts during SIInsertWaitcnts
---
Full diff: https://github.com/llvm/llvm-project/pull/75785.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+16-18)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (-3)
- (modified) llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll (+2-2)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir (+134)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8415a3d77d3bcd..efc38b63647b11 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -492,9 +492,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
MachineBasicBlock::instr_iterator It) const;
-
- // Transform a soft waitcnt into a normal one.
- bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
};
} // end anonymous namespace
@@ -874,15 +871,6 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}
-bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
- unsigned Opcode = Waitcnt->getOpcode();
- if (!SIInstrInfo::isSoftWaitcnt(Opcode))
- return false;
-
- Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
- return true;
-}
-
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
@@ -940,7 +928,6 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (WaitcntInstr) {
Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
AMDGPU::encodeWaitcnt(IV, Wait));
- Modified |= promoteSoftWaitCnt(WaitcntInstr);
ScoreBrackets.applyWaitcnt(Wait);
Wait.VmCnt = ~0u;
@@ -959,7 +946,6 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (WaitcntVsCntInstr) {
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
AMDGPU::OpName::simm16, Wait.VsCnt);
- Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;
@@ -1320,7 +1306,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
[[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_soft)).addImm(Enc);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1331,9 +1317,10 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
- [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.VsCnt);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1935,6 +1922,17 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
} while (Repeat);
+ // Promote all soft waitcnts.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (SIInstrInfo::isSoftWaitcnt(MI.getOpcode())) {
+ MI.setDesc(
+ TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(MI.getOpcode())));
+ Modified = true;
+ }
+ }
+ }
+
if (ST->hasScalarStores()) {
SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
bool HaveScalarStores = false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a40..a91780ce89762f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8784,9 +8784,6 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
- if (SIInstrInfo::isSoftWaitcnt(Opcode))
- Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
-
unsigned Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index 5b770248520562..caf67da9cd2d97 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -26,7 +26,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN: successors:
; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN: S_WAITCNT_soft 3952
+ ; GCN: S_WAITCNT 3952
; GCN: bb.3:
entry:
%cc = icmp sgt i32 %a, 0
@@ -63,7 +63,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN: successors:
; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN: S_WAITCNT_soft 3952
+ ; GCN: S_WAITCNT 3952
; GCN: bb.5:
entry:
%cc = icmp sgt i32 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir
new file mode 100644
index 00000000000000..c32161dd1246cd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-waterfall.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -start-before=si-insert-waitcnts -mcpu=gfx1030 -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_ps <4 x float> @test_waterfall_multi_begin(ptr addrspace(4) inreg %in, ptr addrspace(4) inreg %s_in, i32 %idx1, i32 %idx2, i32 %s_idx, i32 %s_idx2) #0 {
+ ; CHECK-LABEL: test_waterfall_multi_begin:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: s_mov_b32 s4, exec_lo
+ ; CHECK-NEXT: s_wqm_b32 exec_lo, exec_lo
+ ; CHECK-NEXT: v_mov_b32_e32 v7, v2
+ ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+ ; CHECK-NEXT: v_mov_b32_e32 v6, v1
+ ; CHECK-NEXT: v_mov_b32_e32 v5, v0
+ ; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+ ; CHECK-NEXT: v_lshlrev_b64 v[2:3], 4, v[3:4]
+ ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 5, v[7:8]
+ ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+ ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, s2, v2
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
+ ; CHECK-NEXT: s_clause 0x1
+ ; CHECK-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16
+ ; CHECK-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+ ; CHECK-NEXT: global_load_dwordx4 v[15:18], v[2:3], off
+ ; CHECK-NEXT: s_mov_b32 s0, exec_lo
+ ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: v_readfirstlane_b32 s1, v5
+ ; CHECK-NEXT: v_readfirstlane_b32 s2, v6
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s1, s1, v5
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s2, s2, v6
+ ; CHECK-NEXT: s_and_b32 s1, s1, s2
+ ; CHECK-NEXT: s_and_saveexec_b32 s1, s1
+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: v_readfirstlane_b32 s8, v7
+ ; CHECK-NEXT: v_readfirstlane_b32 s9, v8
+ ; CHECK-NEXT: v_readfirstlane_b32 s10, v9
+ ; CHECK-NEXT: v_readfirstlane_b32 s11, v10
+ ; CHECK-NEXT: v_readfirstlane_b32 s12, v11
+ ; CHECK-NEXT: v_readfirstlane_b32 s13, v12
+ ; CHECK-NEXT: v_readfirstlane_b32 s14, v13
+ ; CHECK-NEXT: v_readfirstlane_b32 s15, v14
+ ; CHECK-NEXT: v_readfirstlane_b32 s16, v15
+ ; CHECK-NEXT: v_readfirstlane_b32 s17, v16
+ ; CHECK-NEXT: v_readfirstlane_b32 s18, v17
+ ; CHECK-NEXT: v_readfirstlane_b32 s19, v18
+ ; CHECK-NEXT: v_mov_b32_e32 v1, v0
+ ; CHECK-NEXT: ; implicit-def: $vgpr5
+ ; CHECK-NEXT: ; implicit-def: $vgpr6
+ ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14
+ ; CHECK-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
+ ; CHECK-NEXT: image_sample v[0:3], v[0:1], s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_2D
+ ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+ ; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+ ; CHECK-NEXT: ; %bb.2:
+ ; CHECK-NEXT: s_mov_b32 exec_lo, s0
+ ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, s4
+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
+ ; CHECK-NEXT: ; return to shader part epilog
+ ret <4 x float> poison
+ }
+
+ attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="gfx1030" "uniform-work-group-size"="false" }
+...
+---
+name: test_waterfall_multi_begin
+tracksRegLiveness: true
+machineFunctionInfo:
+ returnsVoid: false
+ psInputAddr: 15
+ psInputEnable: 15
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ $sgpr4 = S_MOV_B32 $exec_lo
+ $exec_lo = S_WQM_B32 $exec_lo, implicit-def $scc
+ $vgpr7 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec
+ renamable $vgpr4 = V_ASHRREV_I32_e32 31, $vgpr3, implicit $exec
+ $vgpr6 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $vgpr0, implicit $exec, implicit $exec
+ renamable $vgpr8 = V_ASHRREV_I32_e32 31, $vgpr7, implicit $exec
+ renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 4, killed $vgpr3_vgpr4, implicit $exec
+ renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 5, killed $vgpr7_vgpr8, implicit $exec
+ renamable $vgpr0, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr0, killed $vgpr0, 0, implicit $exec
+ renamable $vgpr1 = V_ADDC_U32_e32 killed $sgpr1, killed $vgpr1, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
+ renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 killed $sgpr2, killed $vgpr2, 0, implicit $exec
+ renamable $vgpr3 = V_ADDC_U32_e32 killed $sgpr3, killed $vgpr3, implicit-def dead $vcc, implicit killed $vcc, implicit $exec
+ renamable $vgpr11_vgpr12_vgpr13_vgpr14 = GLOBAL_LOAD_DWORDX4 renamable $vgpr0_vgpr1, 16, 0, implicit $exec
+ renamable $vgpr7_vgpr8_vgpr9_vgpr10 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ renamable $vgpr15_vgpr16_vgpr17_vgpr18 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+ renamable $sgpr0 = S_MOV_B32 $exec_lo
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $sgpr4, $vgpr5, $vgpr6, $vgpr15_vgpr16_vgpr17_vgpr18:0x00000000000000FF, $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14:0x000000000000FFFF
+
+ renamable $sgpr1 = V_READFIRSTLANE_B32 $vgpr5, implicit $exec
+ renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr6, implicit $exec
+ renamable $sgpr1 = V_CMP_EQ_U32_e64 killed $sgpr1, killed $vgpr5, implicit $exec
+ renamable $sgpr2 = V_CMP_EQ_U32_e64 killed $sgpr2, killed $vgpr6, implicit $exec
+ renamable $sgpr1 = S_AND_B32 killed renamable $sgpr1, killed renamable $sgpr2, implicit-def dead $scc
+ renamable $sgpr1 = S_AND_SAVEEXEC_B32 killed renamable $sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr7, implicit $exec
+ renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr8, implicit $exec
+ renamable $sgpr10 = V_READFIRSTLANE_B32 killed $vgpr9, implicit $exec
+ renamable $sgpr11 = V_READFIRSTLANE_B32 killed $vgpr10, implicit $exec
+ renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec
+ renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr12, implicit $exec
+ renamable $sgpr14 = V_READFIRSTLANE_B32 killed $vgpr13, implicit $exec
+ renamable $sgpr15 = V_READFIRSTLANE_B32 killed $vgpr14, implicit $exec
+ renamable $sgpr16 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec
+ renamable $sgpr17 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec
+ renamable $sgpr18 = V_READFIRSTLANE_B32 killed $vgpr17, implicit $exec
+ renamable $sgpr19 = V_READFIRSTLANE_B32 killed $vgpr18, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+ renamable $vgpr5 = IMPLICIT_DEF
+ renamable $vgpr6 = IMPLICIT_DEF
+ renamable $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF
+ renamable $vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 killed renamable $vgpr0_vgpr1, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ $exec_lo = S_XOR_B32 $exec_lo, killed renamable $sgpr1, implicit-def dead $scc
+ S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+ bb.2:
+ liveins: $sgpr0, $sgpr4, $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF
+
+ $exec_lo = S_MOV_B32 killed renamable $sgpr0
+ $exec_lo = S_AND_B32 $exec_lo, killed renamable $sgpr4, implicit-def $scc
+ SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/75785
More information about the llvm-commits
mailing list