[llvm] [AMDGPU] Allow amdgpu-waves-per-eu to lower target occupancy range (PR #168358)

Mon Nov 17 04:14:30 PST 2025

https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/168358

>From e412342219be578041e6a4f4e5e029c760762431 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 17 Nov 2025 11:51:29 +0000
Subject: [PATCH 1/2] Allow amdgpu-waves-per-eu to lower occ. target

---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    | 38 +++++----
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  4 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        | 12 +++
 ...-work-group-size-overrides-waves-per-eu.ll | 61 --------------
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  | 25 +++---
 .../test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll |  4 +-
 ...ves-per-eu-hints-lower-occupancy-target.ll | 83 +++++++++++++++++++
 7 files changed, 135 insertions(+), 92 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c7528f993da1e..5325fc22e603a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -190,23 +190,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
   // sizes limits the achievable maximum, and we aim to support enough waves per
   // EU so that we can concurrently execute all waves of a single workgroup of
   // maximum size on a CU.
-  std::pair<unsigned, unsigned> Default = {
+  std::pair<unsigned, unsigned> WavesPerEU = {
       getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
       getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
-  Default.first = std::min(Default.first, Default.second);
-
-  // Make sure requested minimum is within the default range and lower than the
-  // requested maximum. The latter must not violate target specification.
-  if (RequestedWavesPerEU.first < Default.first ||
-      RequestedWavesPerEU.first > Default.second ||
-      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
-      RequestedWavesPerEU.second > getMaxWavesPerEU())
-    return Default;
-
-  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
-  RequestedWavesPerEU.second =
-      std::min(RequestedWavesPerEU.second, Default.second);
-  return RequestedWavesPerEU;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+
+  // Requested minimum must not violate subtarget's specifications and be no
+  // greater than maximum.
+  if (RequestedWavesPerEU.first &&
+      (RequestedWavesPerEU.first < getMinWavesPerEU() ||
+       RequestedWavesPerEU.first > RequestedWavesPerEU.second))
+    return WavesPerEU;
+  // Requested maximum must not violate subtarget's specifications.
+  if (RequestedWavesPerEU.second > getMaxWavesPerEU())
+    return WavesPerEU;
+
+  // A requested maximum may limit both the final minimum and maximum, but
+  // not increase them. A requested minimum can either decrease or increase the
+  // default minimum.
+  WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
+  if (RequestedWavesPerEU.first)
+    WavesPerEU.first = RequestedWavesPerEU.first;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+  return WavesPerEU;
 }
 
 std::pair<unsigned, unsigned>
@@ -225,7 +231,7 @@ std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                                unsigned LDSBytes, const Function &F) const {
   // Default minimum/maximum number of waves per execution unit.
-  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+  std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
 
   // Requested minimum/maximum number of waves per execution unit.
   std::pair<unsigned, unsigned> Requested =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ed03ef21b6dda..2133d7297ac75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -150,7 +150,9 @@ class AMDGPUSubtarget {
   /// Returns the target minimum/maximum number of waves per EU. This is based
   /// on the minimum/maximum number of \p RequestedWavesPerEU and further
   /// limited by the maximum achievable occupancy derived from the range of \p
-  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A minimum
+  /// requested waves/EU value of 0 indicates an intent to not restrict the
+  /// minimum target occupancy.
   std::pair<unsigned, unsigned>
   getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSizes,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 41bce31c6ebc0..4379fc7385d4f 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,3 +225,15 @@ entry:
   ret void
 }
 attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #13 {
+entry:
+  ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
deleted file mode 100644
index 67061bcb2a785..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
-
-; Both of these kernels have the same value for
-; amdgpu-flat-work-group-size, except one explicitly sets it. This is
-; a program visible property which should always take precedence over
-; the amdgpu-waves-per-eu optimization hint.
-;
-; The range is incompatible with the amdgpu-waves-per-eu value, so the
-; flat work group size should take precedence implying a requirement
-; to support 1024 size workgroups (which exceeds the available LDS
-; amount).
-
-; CHECK-NOT: @no_flat_workgroup_size.stack
-; CHECK-NOT: @explicit_default_workgroup_size.stack
-
-; CHECK-LABEL: @no_flat_workgroup_size(
-; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
-define amdgpu_kernel void @no_flat_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, ptr addrspace(1) %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
-  store i32 4, ptr addrspace(5) %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
-  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
-  store i32 5, ptr addrspace(5) %arrayidx3, align 4
-  %2 = load i32, ptr addrspace(5) %stack, align 4
-  store i32 %2, ptr addrspace(1) %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %3 = load i32, ptr addrspace(5) %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
-  store i32 %3, ptr addrspace(1) %arrayidx13
-  ret void
-}
-
-; CHECK-LABEL: @explicit_default_workgroup_size(
-; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
-define amdgpu_kernel void @explicit_default_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
-entry:
-  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, ptr addrspace(1) %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
-  store i32 4, ptr addrspace(5) %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
-  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
-  store i32 5, ptr addrspace(5) %arrayidx3, align 4
-  %2 = load i32, ptr addrspace(5) %stack, align 4
-  store i32 %2, ptr addrspace(1) %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %3 = load i32, ptr addrspace(5) %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
-  store i32 %3, ptr addrspace(1) %arrayidx13
-  ret void
-}
-
-attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 0be314772abdb..3b5a7cd3707b6 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
-; Check propagation of amdgpu-flat-work-group-size attribute.
+; Check propagation of amdgpu-waves-per-eu attribute.
 
 ; Called from a single kernel with 1,8
 define internal void @default_to_1_8_a() {
@@ -216,30 +216,31 @@ define internal i32 @bitcasted_function() {
   ret i32 0
 }
 
-define internal void @called_from_invalid_bounds_0() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
+define internal void @called_without_min_waves() {
+; CHECK-LABEL: define internal void @called_without_min_waves
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
 }
 
-define internal void @called_from_invalid_bounds_1() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
+define internal void @called_from_invalid_bounds() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds
 ; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
 }
 
-; Invalid range for amdgpu-waves-per-eu
-define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
+; Kernel does not specify a minimum desired occupancy so will end up with the
+; default minimum (and the requested maximum).
+define amdgpu_kernel void @kernel_without_min_waves() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_without_min_waves
 ; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_0()
+; CHECK-NEXT:    call void @called_without_min_waves()
 ; CHECK-NEXT:    ret void
 ;
-  call void @called_from_invalid_bounds_0()
+  call void @called_without_min_waves()
   ret void
 }
 
@@ -247,10 +248,10 @@ define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
 define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_1()
+; CHECK-NEXT:    call void @called_from_invalid_bounds()
 ; CHECK-NEXT:    ret void
 ;
-  call void @called_from_invalid_bounds_1()
+  call void @called_from_invalid_bounds()
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
index 9b992a35c3303..b14bb8c292ad7 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
@@ -575,5 +575,5 @@ define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 {
   ret void
 }
 
-attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" }
-attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" "amdgpu-waves-per-eu"="1" }
+attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
+attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" }
diff --git a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
new file mode 100644
index 0000000000000..b0e386cba5aff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
@@ -0,0 +1,83 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
+
+; All kernels have the same value for amdgpu-flat-work-group-size, except the
+; second and third kernels explicitly set it. The first and second kernel also
+; have the same final waves/EU range, except the second kernel explicitly sets
+; it with amdgpu-waves-per-eu. 
+;
+; The third kernel hints the compiler that a maximum occupancy of 1 is desired
+; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS
+; space than when limiting itself to support the maximum default occupancy of
+; 10. This does not break the ABI requirement to support the full possible range
+; of workgroup sizes 
+
+; CHECK-NOT: @no_attributes.stack
+; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack
+
+; CHECK-LABEL: @no_attributes(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @no_attributes(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
+entry:
+  %stack = alloca [5 x i32], align 4, addrspace(5)
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
+  ret void
+}
+
+; CHECK-LABEL: @explicit_default_workgroup_size_and_waves(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @explicit_default_workgroup_size_and_waves(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4, addrspace(5)
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
+  ret void
+}
+
+; CHECK-LABEL: @explicit_low_occupancy_requested(
+; CHECK-NOT: alloca [5 x i32]
+define amdgpu_kernel void @explicit_low_occupancy_requested(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
+entry:
+  %stack = alloca [5 x i32], align 4, addrspace(5)
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="4,10" "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }

>From ae902de6f383a5b3f98992d2ff56be4926168bbb Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <11032120+lucas-rami at users.noreply.github.com>
Date: Mon, 17 Nov 2025 13:14:21 +0100
Subject: [PATCH 2/2] Clarify test comment

---
 .../AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
index b0e386cba5aff..46733a8757ad8 100644
--- a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
@@ -3,13 +3,14 @@
 ; All kernels have the same value for amdgpu-flat-work-group-size, except the
 ; second and third kernels explicitly set it. The first and second kernel also
 ; have the same final waves/EU range, except the second kernel explicitly sets
-; it with amdgpu-waves-per-eu. 
+; it with amdgpu-waves-per-eu. As a result the first and second kernels are
+; treated identically. 
 ;
 ; The third kernel hints the compiler that a maximum occupancy of 1 is desired
 ; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS
 ; space than when limiting itself to support the maximum default occupancy of
 ; 10. This does not break the ABI requirement to support the full possible range
-; of workgroup sizes 
+; of workgroup sizes as specified by amdgpu-flat-work-group-size.
 
 ; CHECK-NOT: @no_attributes.stack
 ; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack