[llvm] [AMDGPU] SIWholeQuadMode: Ensure earliest WQM entry point for PS (PR #123266)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 16 17:40:59 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Carl Ritson (perlfu)
<details>
<summary>Changes</summary>
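Ensure that shaders which run in whole quad mode (WQM), i.e. pixel shaders (PS), enter WQM at the earliest possible point in the entry block, irrespective of where WQM is first marked as needed.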
---
Full diff: https://github.com/llvm/llvm-project/pull/123266.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+8-1)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll (+9-2)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 263f6497b9a7a8..466bc3861c1573 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1305,7 +1305,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
// Record initial state is block information.
BI.InitialState = State;
- for (;;) {
+ for (unsigned Idx = 0; ; ++Idx) {
MachineBasicBlock::iterator Next = II;
char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
char OutNeeds = 0;
@@ -1316,6 +1316,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
if (FirstStrict == IE)
FirstStrict = II;
+ // Adjust needs if this is first instruction of WQM requiring shader.
+ if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
+ Needs = StateWQM;
+
// First, figure out the allowed states (Needs) based on the propagated
// flags.
if (II != IE) {
@@ -1801,6 +1805,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
lowerKillInstrs(true);
Changed = true;
} else {
+ // Mark entry for WQM if required.
+ if (GlobalFlags & StateWQM)
+ Blocks[&Entry].InNeeds |= StateWQM;
// Wave mode switching requires full lowering pass.
for (auto BII : Blocks)
processBlock(*BII.first, BII.first == &Entry);
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 451f64f71282b2..f88d3f1261f0b8 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -5,12 +5,16 @@
define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GFX10-LABEL: _amdgpu_ps_main:
; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_clause 0x3
@@ -70,12 +74,15 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
;
; GFX11-LABEL: _amdgpu_ps_main:
; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_clause 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 14395648de3db7..deab4075818805 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1842,10 +1842,10 @@ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-W64-LABEL: test_kill_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
+; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
@@ -1866,10 +1866,10 @@ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-W32-LABEL: test_kill_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -2174,8 +2174,8 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
; GFX9-W64-LABEL: test_scc:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
; GFX9-W64-NEXT: ; %bb.1: ; %else
@@ -2199,9 +2199,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
;
; GFX10-W32-LABEL: test_scc:
; GFX10-W32: ; %bb.0: ; %main_body
-; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
; GFX10-W32-NEXT: ; %bb.1: ; %else
``````````
</details>
https://github.com/llvm/llvm-project/pull/123266
More information about the llvm-commits mailing list