[Mlir-commits] [llvm] [mlir] [AMDGPU] Emit s_barrier_signal for s.barrier.signal.var when able (PR #191028)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Apr 8 11:46:27 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-llvm
Author: Krzysztof Drewniak (krzysz00)
<details>
<summary>Changes</summary>
When the member count is 0 (which causes the barrier's member count to be re-used from a previous barrier initialization or s.barrier.signal.var) and the barrier is constant, we can represent named barrier signaling with the m0-less form. Enable this behavior.
Assisted by Opus 4.6 which drafted the initial implementation.
---
Full diff: https://github.com/llvm/llvm-project/pull/191028.diff
5 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+18)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-1)
- (modified) llvm/test/CodeGen/AMDGPU/s-barrier.ll (+31)
- (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+3-1)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f576972183eca..9528fb2b446bc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -292,6 +292,8 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa
// void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %barrier, i32 %memberCnt)
// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
+// If %memberCnt is 0, the member count is retained from the previous
+// s_barrier_init or s_barrier_signal operation.
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f5747488225c5..ffb2a12892ca2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -7145,6 +7145,24 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
const MachineOperand &BarOp = I.getOperand(1);
const MachineOperand &CntOp = I.getOperand(2);
+ // A member count of 0 means "keep existing member count". That plus a known
+ // constant value for the barrier ID lets us use the immarg form.
+ if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
+ std::optional<int64_t> CntImm =
+ getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
+ if (CntImm && *CntImm == 0) {
+ std::optional<int64_t> BarValImm =
+ getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+ if (BarValImm) {
+ auto BarID = ((*BarValImm) >> 4) & 0x3F;
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+ .addImm(BarID);
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ }
+
// BarID = (BarOp >> 4) & 0x3F
Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e631d2f4a55e..b1e3334b2e32b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12054,8 +12054,38 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain),
0);
- case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var: {
+ // Member count of 0 means to re-use a previous member count,
+ // which, if the named barrier is statically chosen, means we can use
+ // the immarg form. Otherwise, fall through to constructing M0 as for
+ // s_barrier_init.
+ SDValue CntOp = Op->getOperand(3);
+ auto *CntC = dyn_cast<ConstantSDNode>(CntOp);
+ if (CntC && CntC->isZero()) {
+ SDValue Chain = Op->getOperand(0);
+ SDValue BarOp = Op->getOperand(2);
+ SmallVector<SDValue, 2> Ops;
+
+ std::optional<uint64_t> BarVal;
+ if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
+ BarVal = C->getZExtValue();
+ else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
+ if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
+ *GA->getGlobal()))
+ BarVal = *Addr + GA->getOffset();
+
+ if (BarVal) {
+ unsigned BarID = (*BarVal >> 4) & 0x3F;
+ Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
+ Ops.push_back(Chain);
+ auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
+ Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
+ }
+ [[fallthrough]];
+ }
+ case Intrinsic::amdgcn_s_barrier_init: {
// these two intrinsics have two operands: barrier pointer and member count
SDValue Chain = Op->getOperand(0);
SmallVector<SDValue, 2> Ops;
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 35b86998c9cac..6c438ed94c863 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -258,6 +258,37 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
+define void @signal_var_cnt0_const_bar() {
+; GFX12-LABEL: signal_var_cnt0_const_bar:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_barrier_signal 2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
+ ret void
+}
+
+define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
+; GFX12-LABEL: signal_var_cnt0_dynamic_bar:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshr_b32 s0, s0, 4
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: s_and_b32 m0, s0, 63
+; GFX12-NEXT: s_barrier_signal m0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0)
+ ret void
+}
+
define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) {
; GFX12-LABEL: test_barrier_leave_write_to_scc:
; GFX12: ; %bb.0:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 7305de2049ee2..b13206ce5c342 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -346,7 +346,7 @@ def ROCDL_DsBpermuteOp : ROCDL_ConcreteNonMemIntrOp<"ds_bpermute", [], 1>,
Each lane reads the value of `src` from the lane whose byte address is
given by `index` (i.e. lane id = `index / 4`).
-
+
This is “backward” (pull) in contrast to `ds_permute_b32`, which is
“forward” (push/scatter).
@@ -574,6 +574,8 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
let description = [{
Available on gfx1250+.
+ If `memberCnt` is 0, the member count is retained from a previous initialization.
+
Example:
```mlir
// Signal a named barrier with variable ID.
``````````
</details>
https://github.com/llvm/llvm-project/pull/191028
More information about the Mlir-commits
mailing list