[llvm] 4676242 - AMDGPU/GFX12: Do not wait unnecessarily before barriers (#154970)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 23 00:08:02 PDT 2025
Author: Nicolai Hähnle
Date: 2025-08-23T00:07:59-07:00
New Revision: 46762421c30a361c439ad5930f1fd026601db7f5
URL: https://github.com/llvm/llvm-project/commit/46762421c30a361c439ad5930f1fd026601db7f5
DIFF: https://github.com/llvm/llvm-project/commit/46762421c30a361c439ad5930f1fd026601db7f5.diff
LOG: AMDGPU/GFX12: Do not wait unnecessarily before barriers (#154970)
The barrier intrinsic itself should not have memory semantics. Frontends
should use appropriate fence instructions for memory effects, and some
frontends want to rely on that for performance (e.g. wait only for LDS
before a barrier).
See the code comment for more detail.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
llvm/test/CodeGen/AMDGPU/s-barrier.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4ebb0f746f464..e3a2efdd3856f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2014,11 +2014,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
}
- // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
- // not, we need to ensure the subtarget is capable of backing off barrier
- // instructions in case there are any outstanding memory operations that may
- // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
- if (TII->isBarrierStart(MI.getOpcode()) &&
+ // Ensure safety against exceptions from outstanding memory operations while
+ // waiting for a barrier:
+ //
+ // * Some subtargets safely handle backing off the barrier in hardware
+ // when an exception occurs.
+ // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
+ // there can be no outstanding memory operations during the wait.
+ // * Subtargets with split barriers don't need to back off the barrier; it
+ // is up to the trap handler to preserve the user barrier state correctly.
+ //
+ // In all other cases, ensure safety by ensuring that there are no outstanding
+ // memory operations.
+ if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7c7bb509c9ef..fdbd9ce4a66bf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -983,19 +983,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
}
- // Check to see if opcode is for a barrier start. Pre gfx12 this is just the
- // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
- // to check for the barrier start (S_BARRIER_SIGNAL*)
- bool isBarrierStart(unsigned Opcode) const {
+ bool isBarrier(unsigned Opcode) const {
return Opcode == AMDGPU::S_BARRIER ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
- Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
- }
-
- bool isBarrier(unsigned Opcode) const {
- return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
+ Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+ Opcode == AMDGPU::S_BARRIER_WAIT ||
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 90e150c89955b..9003251253740 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -98,7 +98,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT4-NEXT: s_wait_kmcnt 0x0
; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2
; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1]
-; VARIANT4-NEXT: s_wait_storecnt 0x0
; VARIANT4-NEXT: s_barrier_signal -1
; VARIANT4-NEXT: s_barrier_wait -1
; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -145,7 +144,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4
; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1]
; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VARIANT6-NEXT: s_wait_storecnt 0x0
; VARIANT6-NEXT: s_barrier_signal -1
; VARIANT6-NEXT: s_barrier_wait -1
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
index 651d204f65b6c..248e0c716b975 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -11,7 +11,6 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -27,7 +26,6 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 1821bd45dc1cc..a4fa8e4b3c8e2 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -14,11 +14,10 @@ define void @func1() {
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
+; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -30,13 +29,12 @@ define void @func1() {
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_join 3
+; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -49,11 +47,10 @@ define void @func2() {
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
+; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -65,13 +62,12 @@ define void @func2() {
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_join 1
+; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -102,9 +98,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
; GFX12-SDAG-NEXT: s_barrier_signal -1
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_barrier_wait 1
; GFX12-SDAG-NEXT: s_barrier_leave
; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
@@ -155,11 +151,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
; GFX12-GISEL-NEXT: s_barrier_signal -1
+; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
-; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
-; GFX12-GISEL-NEXT: s_barrier_join m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_barrier_leave
; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
@@ -194,8 +190,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @llvm.amdgcn.s.barrier.leave(i16 1)
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
@@ -219,7 +215,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_barrier_signal m0
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
@@ -227,6 +222,7 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_barrier_join m0
; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -245,10 +241,10 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_join 2
; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
More information about the llvm-commits mailing list