[llvm] [AMDGPU] Relax workgroup fences for single-wave workgroups (PR #187673)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 20 03:58:10 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Barbara (barbara-amd)
<details>
<summary>Changes</summary>
When the maximum flat workgroup size does not exceed the wavefront size, the whole workgroup fits in a single wave. In that case a fence syncscope("workgroup") can be lowered to syncscope("wavefront"): there are no other waves in the workgroup for the fence to synchronize with, so wavefront scope is sufficient.
---
Full diff: https://github.com/llvm/llvm-project/pull/187673.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+21)
- (added) llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll (+42)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8a80101d79a8e..38ad237747f46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
@@ -256,6 +257,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
+ bool visitFenceInst(FenceInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
@@ -1998,6 +2000,25 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
return true;
}
+// A workgroup no larger than one wavefront has no other waves to synchronize
+// with, so its workgroup-scope fences can be relaxed to wavefront scope.
+bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
+ unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+ if (WGMaxSize > ST.getWavefrontSize())
+ return false;
+
+ SyncScope::ID WorkgroupSSID =
+ F.getContext().getOrInsertSyncScopeID("workgroup");
+ SyncScope::ID WavefrontSSID =
+ F.getContext().getOrInsertSyncScopeID("wavefront");
+
+ if (I.getSyncScopeID() != WorkgroupSSID)
+ return false;
+
+ I.setSyncScopeID(WavefrontSSID);
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
Intrinsic::ID IID = I.getIntrinsicID();
switch (IID) {
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
new file mode 100644
index 0000000000000..b8aecf451e31c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: fence syncscope("wavefront") acq_rel
+; CHECK-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
+; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE32-NEXT: fence syncscope("workgroup") acq_rel
+; WAVE32-NEXT: ret void
+;
+; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE64-NEXT: fence syncscope("wavefront") acq_rel
+; WAVE64-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
+; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: fence syncscope("workgroup") acq_rel
+; CHECK-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
``````````
</details>
https://github.com/llvm/llvm-project/pull/187673
More information about the llvm-commits
mailing list