[llvm] 3bee9ba - AMDGPU/GFX12: Fix s_barrier_signal_isfirst for single-wave workgroups (#143634)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 19 11:22:52 PDT 2025
Author: Nicolai Hähnle
Date: 2025-06-19T11:22:49-07:00
New Revision: 3bee9ba0156ee130fa88379a5a89de0812936a3d
URL: https://github.com/llvm/llvm-project/commit/3bee9ba0156ee130fa88379a5a89de0812936a3d
DIFF: https://github.com/llvm/llvm-project/commit/3bee9ba0156ee130fa88379a5a89de0812936a3d.diff
LOG: AMDGPU/GFX12: Fix s_barrier_signal_isfirst for single-wave workgroups (#143634)
Barrier instructions are no-ops in single-wave workgroups. This includes
s_barrier_signal_isfirst, which will leave SCC unmodified.
Model this correctly (via an implicit use of SCC) and ensure SCC==1
before the barrier instruction (if the wave is the only one of the
workgroup, then it is the first).
---------
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c052b076c21c3..ed3e4c8513e2b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1402,6 +1402,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
performs subtraction only if the memory value is greater than or
equal to the data value.
+ llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_isfirst instruction;
+ additionally ensures that the result value is valid even when the
+ intrinsic is used from a wave that is not running in a workgroup.
+
llvm.amdgcn.s.getpc Provides access to the s_getpc_b64 instruction, but with the return value
sign-extended from the width of the underlying PC hardware register even on
processors where the s_getpc_b64 instruction returns a zero-extended value.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7e72f6ca478fd..672520390c8bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
const DebugLoc &DL = I.getDebugLoc();
Register CCReg = I.getOperand(0).getReg();
+ // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
+
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
.addImm(I.getOperand(2).getImm());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 586de433ea28a..b1e77a282e415 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5423,6 +5423,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
+ // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+ BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addImm(0)
+ .addImm(0);
+ return BB;
+ }
case AMDGPU::GET_GROUPSTATICSIZE: {
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e0a36758534d5..90e65a6950c0a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
"", []>{
let Defs = [SCC];
+ let Uses = [M0, SCC];
let SchedRW = [WriteBarrier];
let isConvergent = 1;
}
@@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
(ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
let Defs = [SCC];
+ let Uses = [SCC];
+ let usesCustomInserter = 1;
let SchedRW = [WriteBarrier];
let isConvergent = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index e4b16a3fa0040..f437dee253d00 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -374,7 +374,8 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: V_NOP_e32 implicit $exec
- ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
@@ -385,7 +386,8 @@ body: |
bb.1:
successors: %bb.2
V_NOP_e32 implicit $exec
- S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
bb.2:
S_ENDPGM 0
@@ -437,6 +439,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: V_NOP_e32 implicit $exec
; CHECK-NEXT: $m0 = S_MOV_B32 -1
+ ; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
@@ -449,7 +452,8 @@ body: |
successors: %bb.2
V_NOP_e32 implicit $exec
$m0 = S_MOV_B32 -1
- S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
+ S_CMP_EQ_U32 0, 0, implicit-def $scc
+ S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc
bb.2:
S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
new file mode 100644
index 0000000000000..651d204f65b6c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+define i1 @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ ret i1 %r
+}
+
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)
More information about the llvm-commits
mailing list