[llvm] [AMDGPU] Allow 0 as min./max. of amdgpu-waves-per-eu (PR #138284)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Fri May 2 07:22:34 PDT 2025


https://github.com/lucas-rami created https://github.com/llvm/llvm-project/pull/138284

Clang's attribute reference specifies, for the "amdgpu-waves-per-eu" attribute, that "Passing 0, 0 as <min>, <max> implies the default behavior". However, the backend currently treats a minimum or maximum of 0 as an invalid bound, which causes the whole attribute to be ignored.

This patch makes the backend treat a bound of 0 as leaving that bound unrestricted, in line with the documentation. In particular, it allows a user to specify a maximum desired number of waves/EU without specifying a minimum (e.g., "amdgpu-waves-per-eu"="0,4"), which is currently not possible.

The following pairs are equivalent (for 0 < N < getMaxWavesPerEU()):
- "amdgpu-waves-per-eu"="N,0" and "amdgpu-waves-per-eu"="N".
- "amdgpu-waves-per-eu"="0,0" and no attribute.

From 2b4ef1fba8fc9bedb0be9fec725ecd33e0bbbae2 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 2 May 2025 13:57:35 +0000
Subject: [PATCH] Allow 0 as min/max number of waves per EU

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  3 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    | 24 ++++++++----
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  4 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        | 38 +++++++++++++++++++
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  | 28 +++++++-------
 5 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index b9ce8dc0c5cdb..0bbbe766968fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1125,8 +1125,7 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
       indicateOptimisticFixpoint();
     };
 
-    std::pair<unsigned, unsigned> MaxWavesPerEURange{
-        1U, InfoCache.getMaxWavesPerEU(*F)};
+    std::pair<unsigned, unsigned> MaxWavesPerEURange{0, 0};
 
     // If the attribute exists, we will honor it if it is not the default.
     if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 563605f964cc6..4212d97eb9404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -191,17 +191,25 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
       getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
   Default.first = std::min(Default.first, Default.second);
 
-  // Make sure requested minimum is less than requested maximum.
+  if (RequestedWavesPerEU.first) {
+    // Requested minimum must not violate subtarget's specifications.
+    if (RequestedWavesPerEU.first < Default.first)
+      return Default;
+    // Requested maximum must be no lesser than minimum.
+    if (RequestedWavesPerEU.second &&
+        RequestedWavesPerEU.first > RequestedWavesPerEU.second)
+      return Default;
+  }
+  // Requested maximum must not violate subtarget's specifications.
   if (RequestedWavesPerEU.second &&
-      RequestedWavesPerEU.first > RequestedWavesPerEU.second)
-    return Default;
-
-  // Make sure requested values do not violate subtarget's specifications and
-  // are compatible with values implied by minimum/maximum flat workgroup sizes.
-  if (RequestedWavesPerEU.first < Default.first ||
       RequestedWavesPerEU.second > Default.second)
     return Default;
 
+  // Replace unspecified bounds in the request with the default bounds.
+  if (!RequestedWavesPerEU.first)
+    RequestedWavesPerEU.first = Default.first;
+  if (!RequestedWavesPerEU.second)
+    RequestedWavesPerEU.second = Default.second;
   return RequestedWavesPerEU;
 }
 
@@ -220,7 +228,7 @@ std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                                unsigned LDSBytes, const Function &F) const {
   // Default minimum/maximum number of waves per execution unit.
-  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+  std::pair<unsigned, unsigned> Default(0, 0);
 
   // Requested minimum/maximum number of waves per execution unit.
   std::pair<unsigned, unsigned> Requested =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 91fe2a69bc0b7..1cbb6a7b1ad43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -119,7 +119,9 @@ class AMDGPUSubtarget {
   /// Returns the target minimum/maximum number of waves per EU. This is based
   /// on the minimum/maximum number of \p RequestedWavesPerEU and further
   /// limited by the maximum achievable occupancy derived from the range of \p
-  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
+  /// minimum/maximum requested waves/EU value of 0 indicates an intent to not
+  /// restrict the corresponding bound.
   std::pair<unsigned, unsigned>
   getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSizes,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4507fd5865989..d8827d0405295 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -200,3 +200,41 @@ entry:
   ret void
 }
 attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #11 {
+entry:
+  ret void
+}
+attributes #11 = {"amdgpu-waves-per-eu"="0,2"}
+
+; Exactly 1024 workitems (limits occupancy to 8) and at least 5 waves per execution unit.
+; "amdgpu-waves-per-eu"="5,0" should have the same effect as "amdgpu-waves-per-eu"="5".
+; CHECK-LABEL: {{^}}empty_workitems_exactly_1024_waves_at_least_5:
+; CHECK: SGPRBlocks: 8
+; CHECK: VGPRBlocks: 7
+; CHECK: NumSGPRsForWavesPerEU: 65
+; CHECK: NumVGPRsForWavesPerEU: 29
+define amdgpu_kernel void @empty_workitems_exactly_1024_waves_at_least_5() #12 {
+entry:
+  ret void
+}
+attributes #12 = {"amdgpu-waves-per-eu"="5,0" "amdgpu-flat-work-group-size"="1024,1024"}
+
+; Unrestricted number of waves per execution unit.
+; "amdgpu-waves-per-eu"="0,0" should have the same effect as not providing the attribute.
+; CHECK-LABEL: {{^}}empty_default_waves:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @empty_default_waves() #13 {
+entry:
+  ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,0"}
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index ae114f3213d8f..967cc764ea19c 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
-; Check propagation of amdgpu-flat-work-group-size attribute.
+; Check propagation of amdgpu-waves-per-eu attribute.
 
 ; Called from a single kernel with 1,8
 define internal void @default_to_1_8_a() {
@@ -216,30 +216,30 @@ define internal i32 @bitcasted_function() {
   ret i32 0
 }
 
-define internal void @called_from_invalid_bounds_0() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
-; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+define internal void @called_without_min_waves() {
+; CHECK-LABEL: define internal void @called_without_min_waves
+; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
 }
 
-define internal void @called_from_invalid_bounds_1() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
-; CHECK-SAME: () #[[ATTR10]] {
+define internal void @called_from_invalid_bounds() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds
+; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
 }
 
-; Invalid range for amdgpu-waves-per-eu
-define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
+; amdgpu-waves-per-eu range only provides a maximum.
+define amdgpu_kernel void @kernel_0_8() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_0_8
 ; CHECK-SAME: () #[[ATTR0]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_0()
+; CHECK-NEXT:    call void @called_without_min_waves()
 ; CHECK-NEXT:    ret void
 ;
-  call void @called_from_invalid_bounds_0()
+  call void @called_without_min_waves()
   ret void
 }
 
@@ -247,10 +247,10 @@ define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
 define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_1()
+; CHECK-NEXT:    call void @called_from_invalid_bounds()
 ; CHECK-NEXT:    ret void
 ;
-  call void @called_from_invalid_bounds_1()
+  call void @called_from_invalid_bounds()
   ret void
 }
 



More information about the llvm-commits mailing list