[llvm] [AMDGPU][InsertWaitCnts] Optimize loadcnt insertion at function boundaries (PR #169647)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 26 04:48:50 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Pankaj Dwivedi (PankajDwivedi-25)
<details>
<summary>Changes</summary>
On GFX12+, GLOBAL_INV increments the loadcnt counter but does not write results to any VGPRs. Previously, we unconditionally inserted s_wait_loadcnt 0 at function returns even when the only pending loadcnt was from GLOBAL_INV instructions.
This patch optimizes waitcnt insertion by skipping the loadcnt wait at function boundaries when no VGPRs have pending loads. This is determined by checking if any VGPR has a score greater than the lower bound for LOAD_CNT - if not, the pending loadcnt must be from non-VGPR-writing instructions like GLOBAL_INV.
The optimization is limited to GFX12+ targets where GLOBAL_INV exists and uses the extended wait count instructions.
This is a follow-up optimization to PR #<!-- -->135340 which added tracking for GLOBAL_INV in the waitcnt pass.
---
Patch is 592.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169647.diff
32 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+26-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+1-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (-10)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (-41)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (-32)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (-32)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (-100)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (-90)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (-90)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (-86)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (-47)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+1-6)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (-224)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (-90)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (-90)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (-86)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll (-47)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll (+6-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll (+6-12)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (-42)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (-46)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (-46)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (-46)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-func-global-inv.mir (+115)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 70db7b4918515..b2b562a0f8269 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -715,6 +715,22 @@ class WaitcntBrackets {
PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
}
+ // Returns true if any VGPR has a pending load (score > lower bound for T).
+ // This is used to optimize waitcnt insertion at function boundaries when the
+ // only pending LOAD_CNT events are from instructions that don't write to
+ // VGPRs (e.g., GLOBAL_INV).
+ bool hasPendingVGPRWait(InstCounterType T) const {
+ unsigned LB = getScoreLB(T);
+ // If VgprUB is -1, no VGPRs have been touched
+ if (VgprUB < 0)
+ return false;
+ for (int RegNo = 0; RegNo <= VgprUB; ++RegNo) {
+ if (VgprScores[T][RegNo] > LB)
+ return true;
+ }
+ return false;
+ }
+
ArrayRef<const MachineInstr *> getLDSDMAStores() const {
return LDSDMAStores;
}
@@ -1904,7 +1920,16 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
Opc == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
+ AMDGPU::Waitcnt AllZeroWait =
+ WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
+ // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
+ // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
+ // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
+ // no need to wait for it at function boundaries.
+ if (ST->hasExtendedWaitCounts() &&
+ !ScoreBrackets.hasPendingVGPRWait(LOAD_CNT))
+ AllZeroWait.LoadCnt = ~0u;
+ Wait = Wait.combined(AllZeroWait);
}
// In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
// Technically the hardware will do this on its own if we don't, but that
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 8063b29c29985..32f539a267e67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -22,7 +22,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_f32:
@@ -95,7 +94,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_f32:
@@ -168,7 +166,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_f64:
@@ -245,7 +242,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_f64:
@@ -322,7 +318,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -469,7 +464,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -630,7 +624,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -786,7 +779,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -917,7 +909,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1060,7 +1051,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1220,7 +1210,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1374,7 +1363,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1664,7 +1651,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1838,7 +1824,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2005,7 +1990,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 5b0b602bd99ba..be0ef85b217de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -22,7 +22,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_f32:
@@ -95,7 +94,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_f32:
@@ -168,7 +166,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_f64:
@@ -245,7 +242,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_f64:
@@ -322,7 +318,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -469,7 +464,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -630,7 +624,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -786,7 +779,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -917,7 +909,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1060,7 +1051,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1220,7 +1210,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1374,7 +1363,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1664,7 +1651,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1838,7 +1824,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2005,7 +1990,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 58586129fb4e9..1d0b423c1e0c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1803,7 +1803,6 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1841,7 +1840,6 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1881,7 +1879,6 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2127,7 +2124,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
@@ -2165,7 +2161,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
@@ -2207,7 +2202,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index b75eb737534e9..5c80c27c3d280 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -1232,7 +1232,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1280,7 +1279,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1322,7 +1320,6 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/169647
More information about the llvm-commits
mailing list