[llvm] [AMDGPU] Relax workgroup fences for single-wave workgroups (PR #187673)

Fri Mar 20 03:58:10 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Barbara  (barbara-amd)

<details>
<summary>Changes</summary>

When the whole workgroup fits in a single wavefront (i.e. the maximum flat workgroup size does not exceed the wavefront size), a `fence syncscope("workgroup")` can be relaxed to `syncscope("wavefront")`: there are no other waves in the workgroup for the fence to synchronize with, so wavefront scope is sufficient.

---
Full diff: https://github.com/llvm/llvm-project/pull/187673.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+21) 
- (added) llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll (+42) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8a80101d79a8e..38ad237747f46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -256,6 +257,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
+  bool visitFenceInst(FenceInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
@@ -1998,6 +2000,25 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   return true;
 }
 
+// Relax a fence syncscope("workgroup") to syncscope("wavefront") when the
+// maximum flat workgroup size does not exceed the wavefront size, i.e. the
+// whole workgroup runs as a single wave. Returns true if the fence changed.
+bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
+  // A larger workgroup may span several waves; keep the fence as is.
+  const unsigned MaxFlatWGSize = ST.getFlatWorkGroupSizes(F).second;
+  if (MaxFlatWGSize > ST.getWavefrontSize())
+    return false;
+
+  LLVMContext &Ctx = F.getContext();
+  const SyncScope::ID WorkgroupScope = Ctx.getOrInsertSyncScopeID("workgroup");
+  const SyncScope::ID WavefrontScope = Ctx.getOrInsertSyncScopeID("wavefront");
+  if (I.getSyncScopeID() != WorkgroupScope)
+    return false;
+
+  I.setSyncScopeID(WavefrontScope);
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   Intrinsic::ID IID = I.getIntrinsicID();
   switch (IID) {
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
new file mode 100644
index 0000000000000..b8aecf451e31c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("wavefront") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
+; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE32-NEXT:    fence syncscope("workgroup") acq_rel
+; WAVE32-NEXT:    ret void
+;
+; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE64-NEXT:    fence syncscope("wavefront") acq_rel
+; WAVE64-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
+; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("workgroup") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/187673