[llvm] 3bee9ba - AMDGPU/GFX12: Fix s_barrier_signal_isfirst for single-wave workgroups (#143634)

Thu Jun 19 11:22:52 PDT 2025

Author: Nicolai Hähnle
Date: 2025-06-19T11:22:49-07:00
New Revision: 3bee9ba0156ee130fa88379a5a89de0812936a3d

URL: https://github.com/llvm/llvm-project/commit/3bee9ba0156ee130fa88379a5a89de0812936a3d
DIFF: https://github.com/llvm/llvm-project/commit/3bee9ba0156ee130fa88379a5a89de0812936a3d.diff

LOG: AMDGPU/GFX12: Fix s_barrier_signal_isfirst for single-wave workgroups (#143634)

Barrier instructions are no-ops in single-wave workgroups. This includes
s_barrier_signal_isfirst, which will leave SCC unmodified.

Model this correctly (via an implicit use of SCC) and ensure SCC==1
before the barrier instruction (if the wave is the only one of the
workgroup, then it is the first).

---------

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>

Added: 
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c052b076c21c3..ed3e4c8513e2b 100644

--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1402,6 +1402,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    performs subtraction only if the memory value is greater than or
                                                    equal to the data value.
 
+  llvm.amdgcn.s.barrier.signal.isfirst             Provides access to the s_barrier_signal_first instruction;
+                                                   additionally ensures that the result value is valid even when the
+                                                   intrinsic is used from a wave that is not running in a workgroup.
+
   llvm.amdgcn.s.getpc                              Provides access to the s_getpc_b64 instruction, but with the return value
                                                    sign-extended from the width of the underlying PC hardware register even on
                                                    processors where the s_getpc_b64 instruction returns a zero-extended value.

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7e72f6ca478fd..672520390c8bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
   const DebugLoc &DL = I.getDebugLoc();
   Register CCReg = I.getOperand(0).getReg();
 
+  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
+
   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
       .addImm(I.getOperand(2).getImm());
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 586de433ea28a..b1e77a282e415 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5423,6 +5423,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
+    // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+            TII->get(AMDGPU::S_CMP_EQ_U32))
+        .addImm(0)
+        .addImm(0);
+    return BB;
+  }
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e0a36758534d5..90e65a6950c0a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
 def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
   "", []>{
   let Defs = [SCC];
+  let Uses = [M0, SCC];
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
 }
@@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
 def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
   (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
   let Defs = [SCC];
+  let Uses = [SCC];
+  let usesCustomInserter = 1;
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index e4b16a3fa0040..f437dee253d00 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -374,7 +374,8 @@ body: |
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+  ; CHECK-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   S_ENDPGM 0
@@ -385,7 +386,8 @@ body: |
   bb.1:
     successors: %bb.2
     V_NOP_e32 implicit $exec
-    S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+    S_CMP_EQ_U32 0, 0, implicit-def $scc
+    S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
 
   bb.2:
     S_ENDPGM 0
@@ -437,6 +439,7 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   V_NOP_e32 implicit $exec
   ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -449,7 +452,8 @@ body: |
     successors: %bb.2
     V_NOP_e32 implicit $exec
     $m0 = S_MOV_B32 -1
-    S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
+    S_CMP_EQ_U32 0, 0, implicit-def $scc
+    S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc
 
   bb.2:
     S_ENDPGM 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
new file mode 100644
index 0000000000000..651d204f65b6c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200  < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+define i1 @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+  ret i1 %r
+}
+
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)