[llvm] [AMDGPU] Fix broken XCNT Test cases (PR #160812)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 22:11:42 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
Hardware does an implicit `S_WAIT_XCNT 0` between
alternate SMEM and VMEM instructions, so there are
never outstanding address translations for both SMEM
and VMEM at the same time.
---
Full diff: https://github.com/llvm/llvm-project/pull/160812.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+11)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/wait-xcnt.mir (+3-6)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ae75fb529dade..ebf059a7bff2d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1004,6 +1004,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
}
} else if (T == X_CNT) {
+ WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
+ if (PendingEvents & (1 << OtherEvent)) {
+ // Hardware inserts an implicit xcnt between interleaved
+ // SMEM and VMEM operations. So there will never be
+ // outstanding address translations for both SMEM and
+ // VMEM at the same time.
+ setScoreLB(T, CurrScore - 1);
+ PendingEvents &= ~(1 << OtherEvent);
+ }
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -2257,6 +2266,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
+ // For architectures with X_CNT, mark the source address operands
+ // with the appropriate counter values.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
bool IsVMEMAccess = false;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 243f0ed3a8d0d..f8655a702180e 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: .LBB5_3: ; %bb4
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index af8b9e7c8cd82..6fe99d895c7bb 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -520,6 +520,7 @@ body: |
; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
$sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
@@ -921,7 +922,6 @@ body: |
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem
tracksRegLiveness: true
@@ -937,6 +937,7 @@ body: |
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -944,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
@@ -960,6 +960,7 @@ body: |
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
@@ -967,7 +968,6 @@ body: |
$sgpr0 = S_MOV_B32 0
...
-# TODO: Unnecessary wait before overwriting vgpr0.
---
name: overwrite_vgpr_after_smem
tracksRegLiveness: true
@@ -981,14 +981,12 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# TODO: Unnecessary wait before overwriting sgpr0.
---
name: overwrite_sgpr_after_vmem
tracksRegLiveness: true
@@ -1002,7 +1000,6 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
``````````
</details>
https://github.com/llvm/llvm-project/pull/160812
More information about the llvm-commits
mailing list