[llvm] a96ec01 - [AMDGPU] Optimize out s_barrier_signal/_wait (#116993)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 26 01:04:36 PST 2024
Author: Piotr Sobczak
Date: 2024-11-26T10:04:32+01:00
New Revision: a96ec01e1a269b663ccc1dadc2f4429fd0df887d
URL: https://github.com/llvm/llvm-project/commit/a96ec01e1a269b663ccc1dadc2f4429fd0df887d
DIFF: https://github.com/llvm/llvm-project/commit/a96ec01e1a269b663ccc1dadc2f4429fd0df887d.diff
LOG: [AMDGPU] Optimize out s_barrier_signal/_wait (#116993)
Extend the optimization that converts s_barrier to wave_barrier (a nop)
when the number of work items is not larger than the wave size.
This handles the "split barrier" form of s_barrier, where the barrier
is represented by separate intrinsics (s_barrier_signal/s_barrier_wait).
Note: the case where s_barrier is used on gfx12 (and split later during
lowering) already has this optimization, but some front-ends may prefer
to emit the split intrinsics directly; that case is what this patch
addresses.
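For illustration, a minimal IR sketch of the split-barrier form this patch
now optimizes (the kernel name is illustrative; the pattern and attribute
mirror the new test below). When the maximum flat work group size does not
exceed the wave size, the signal is expected to be removed and the wait
lowered to a wave barrier:

  ; Split-barrier form in a kernel whose workgroup is known to fit in a
  ; single wave (max flat work group size 32 on a wave32 target).
  ; With this change, the signal call is dropped and the wait becomes a
  ; wave barrier (nop) instead of real s_barrier_signal/s_barrier_wait.
  define amdgpu_kernel void @split_barrier_small_wg() #0 {
    tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
    tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
    ret void
  }

  declare void @llvm.amdgcn.s.barrier.signal(i32 immarg)
  declare void @llvm.amdgcn.s.barrier.wait(i16 immarg)

  attributes #0 = { nounwind "amdgpu-flat-work-group-size"="16,32" }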
Added:
llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 39bec6c7f2f56d..eadce16ae0a9ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1878,19 +1878,25 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
}
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
if (WGSize <= STI.getWavefrontSize()) {
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
+ // If the workgroup fits in a wave, remove s_barrier_signal and lower
+ // s_barrier/s_barrier_wait to wave_barrier.
+ if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
+ IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
+ }
MI.eraseFromParent();
return true;
}
}
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- if (STI.hasSplitBarriers()) {
+ if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
@@ -2207,6 +2213,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_wait:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f326416a324178..3f0845864336fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9614,18 +9614,26 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
- case Intrinsic::amdgcn_s_barrier: {
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_wait: {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
- if (WGSize <= ST.getWavefrontSize())
- return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
- Op.getOperand(0)),
- 0);
+ if (WGSize <= ST.getWavefrontSize()) {
+ // If the workgroup fits in a wave, remove s_barrier_signal and lower
+ // s_barrier/s_barrier_wait to wave_barrier.
+ if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
+ return Op.getOperand(0);
+ else
+ return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
+ MVT::Other, Op.getOperand(0)),
+ 0);
+ }
}
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- if (ST.hasSplitBarriers()) {
+ if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
SDValue K =
DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
SDValue BarSignal =
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
new file mode 100644
index 00000000000000..d26d406df52201
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_kernel void @signal_unknown_wgs() {
+; CHECK-LABEL: signal_unknown_wgs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_barrier_signal -1
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ ret void
+}
+
+define amdgpu_kernel void @signal_flat_wgs_attr_32_128() #1 {
+; CHECK-LABEL: signal_flat_wgs_attr_32_128:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_barrier_signal -1
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ ret void
+}
+
+define amdgpu_kernel void @signal_flat_wgs_attr_16_32() #2 {
+; CHECK-LABEL: signal_flat_wgs_attr_16_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ ret void
+}
+
+
+define amdgpu_kernel void @wait_unknown_wgs() {
+; CHECK-LABEL: wait_unknown_wgs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_barrier_wait -1
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
+
+define amdgpu_kernel void @wait_flat_wgs_attr_32_128() #1 {
+; CHECK-LABEL: wait_flat_wgs_attr_32_128:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_barrier_wait -1
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
+
+define amdgpu_kernel void @wait_flat_wgs_attr_16_32() #2 {
+; CHECK-LABEL: wait_flat_wgs_attr_16_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+ tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier.signal(i32 immarg) #0
+declare void @llvm.amdgcn.s.barrier.wait(i16 immarg) #0
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="16,32" }