[llvm-branch-commits] [llvm] 7633143 - Revert "[AMDGPU] Allow amdgpu-waves-per-eu to lower target occupancy range (#…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jan 23 00:26:09 PST 2026
Author: Jan Patrick Lehr
Date: 2026-01-23T09:26:04+01:00
New Revision: 7633143429b8194feef72ce72c9082e9af92611d
URL: https://github.com/llvm/llvm-project/commit/7633143429b8194feef72ce72c9082e9af92611d
DIFF: https://github.com/llvm/llvm-project/commit/7633143429b8194feef72ce72c9082e9af92611d.diff
LOG: Revert "[AMDGPU] Allow amdgpu-waves-per-eu to lower target occupancy range (#…"
This reverts commit 967aeecdaa7db58db4cc896823b0327636c7219c.
Added:
llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
Removed:
llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1ff378c72628f..5ca8ee22306f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -190,29 +190,23 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
// sizes limits the achievable maximum, and we aim to support enough waves per
// EU so that we can concurrently execute all waves of a single workgroup of
// maximum size on a CU.
- std::pair<unsigned, unsigned> WavesPerEU = {
+ std::pair<unsigned, unsigned> Default = {
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
- WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
-
- // Requested minimum must not violate subtarget's specifications and be no
- // greater than maximum.
- if (RequestedWavesPerEU.first &&
- (RequestedWavesPerEU.first < getMinWavesPerEU() ||
- RequestedWavesPerEU.first > RequestedWavesPerEU.second))
- return WavesPerEU;
- // Requested maximum must not violate subtarget's specifications.
- if (RequestedWavesPerEU.second > getMaxWavesPerEU())
- return WavesPerEU;
-
- // A requested maximum may limit both the final minimum and maximum, but
- // not increase them. A requested minimum can either decrease or increase the
- // default minimum.
- WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
- if (RequestedWavesPerEU.first)
- WavesPerEU.first = RequestedWavesPerEU.first;
- WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
- return WavesPerEU;
+ Default.first = std::min(Default.first, Default.second);
+
+ // Make sure requested minimum is within the default range and lower than the
+ // requested maximum. The latter must not violate target specification.
+ if (RequestedWavesPerEU.first < Default.first ||
+ RequestedWavesPerEU.first > Default.second ||
+ RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
+ RequestedWavesPerEU.second > getMaxWavesPerEU())
+ return Default;
+
+ // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
+ RequestedWavesPerEU.second =
+ std::min(RequestedWavesPerEU.second, Default.second);
+ return RequestedWavesPerEU;
}
std::pair<unsigned, unsigned>
@@ -231,7 +225,7 @@ std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
unsigned LDSBytes, const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
+ std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 2dbd56e792d4a..09df36e953129 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -180,9 +180,7 @@ class AMDGPUSubtarget {
/// Returns the target minimum/maximum number of waves per EU. This is based
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
/// limited by the maximum achievable occupancy derived from the range of \p
- /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A minimum
- /// requested waves/EU value of 0 indicates an intent to not restrict the
- /// minimum target occupancy.
+ /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
std::pair<unsigned, unsigned>
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSizes,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4379fc7385d4f..41bce31c6ebc0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,15 +225,3 @@ entry:
ret void
}
attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
-
-; At most 2 waves per execution unit.
-; CHECK-LABEL: {{^}}empty_at_most_2:
-; CHECK: SGPRBlocks: 12
-; CHECK: VGPRBlocks: 21
-; CHECK: NumSGPRsForWavesPerEU: 102
-; CHECK: NumVGPRsForWavesPerEU: 85
-define amdgpu_kernel void @empty_at_most_2() #13 {
-entry:
- ret void
-}
-attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
new file mode 100644
index 0000000000000..67061bcb2a785
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
+
+; Both of these kernels have the same value for
+; amdgpu-flat-work-group-size, except one explicitly sets it. This is
+; a program visible property which should always take precedence over
+; the amdgpu-waves-per-eu optimization hint.
+;
+; The range is incompatible with the amdgpu-waves-per-eu value, so the
+; flat work group size should take precedence implying a requirement
+; to support 1024 size workgroups (which exceeds the available LDS
+; amount).
+
+; CHECK-NOT: @no_flat_workgroup_size.stack
+; CHECK-NOT: @explicit_default_workgroup_size.stack
+
+; CHECK-LABEL: @no_flat_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @no_flat_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4, addrspace(5)
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
+ ret void
+}
+
+; CHECK-LABEL: @explicit_default_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @explicit_default_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
+entry:
+ %stack = alloca [5 x i32], align 4, addrspace(5)
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
+ ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 3b5a7cd3707b6..0be314772abdb 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
-; Check propagation of amdgpu-waves-per-eu attribute.
+; Check propagation of amdgpu-flat-work-group-size attribute.
; Called from a single kernel with 1,8
define internal void @default_to_1_8_a() {
@@ -216,31 +216,30 @@ define internal i32 @bitcasted_function() {
ret i32 0
}
-define internal void @called_without_min_waves() {
-; CHECK-LABEL: define internal void @called_without_min_waves
+define internal void @called_from_invalid_bounds_0() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: ret void
;
ret void
}
-define internal void @called_from_invalid_bounds() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds
+define internal void @called_from_invalid_bounds_1() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
-; Kernel does not specify a minimum desired occupancy so will end up with the
-; default minimum (and the requested maximum).
-define amdgpu_kernel void @kernel_without_min_waves() #9 {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_without_min_waves
+; Invalid range for amdgpu-waves-per-eu
+define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT: call void @called_without_min_waves()
+; CHECK-NEXT: call void @called_from_invalid_bounds_0()
; CHECK-NEXT: ret void
;
- call void @called_without_min_waves()
+ call void @called_from_invalid_bounds_0()
ret void
}
@@ -248,10 +247,10 @@ define amdgpu_kernel void @kernel_without_min_waves() #9 {
define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
-; CHECK-NEXT: call void @called_from_invalid_bounds()
+; CHECK-NEXT: call void @called_from_invalid_bounds_1()
; CHECK-NEXT: ret void
;
- call void @called_from_invalid_bounds()
+ call void @called_from_invalid_bounds_1()
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
index b14bb8c292ad7..9b992a35c3303 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
@@ -575,5 +575,5 @@ define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 {
ret void
}
-attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
-attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" }
+attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" }
+attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" "amdgpu-waves-per-eu"="1" }
diff --git a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
deleted file mode 100644
index 28c24f024455d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
-
-; All kernels have the same value for amdgpu-flat-work-group-size, except the
-; second and third kernels explicitly set it. The first and second kernel also
-; have the same final waves/EU range, except the second kernel explicitly sets
-; it with amdgpu-waves-per-eu. As a result the first and second kernels are
-; treated identically.
-;
-; The third kernel hints the compiler that a maximum occupancy of 1 is desired
-; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS
-; space than when limiting itself to support the maximum default occupancy of
-; 10. This does not break the ABI requirement to support the full possible range
-; of workgroup sizes as specified by amdgpu-flat-work-group-size.
-
-; CHECK-NOT: @no_attributes.stack
-; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack
-
-; CHECK-LABEL: @no_attributes(
-; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
-define amdgpu_kernel void @no_attributes(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-entry:
- %stack = alloca [5 x i32], align 4, addrspace(5)
- %in_data0 = load i32, ptr addrspace(1) %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data0
- store i32 4, ptr addrspace(5) %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
- %in_data1 = load i32, ptr addrspace(1) %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data1
- store i32 5, ptr addrspace(5) %arrayidx3, align 4
- %out_data0 = load i32, ptr addrspace(5) %stack, align 4
- store i32 %out_data0, ptr addrspace(1) %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
- %out_data1 = load i32, ptr addrspace(5) %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
- store i32 %out_data1, ptr addrspace(1) %arrayidx13
- ret void
-}
-
-; CHECK-LABEL: @explicit_default_workgroup_size_and_waves(
-; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
-define amdgpu_kernel void @explicit_default_workgroup_size_and_waves(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
-entry:
- %stack = alloca [5 x i32], align 4, addrspace(5)
- %in_data0 = load i32, ptr addrspace(1) %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data0
- store i32 4, ptr addrspace(5) %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
- %in_data1 = load i32, ptr addrspace(1) %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data1
- store i32 5, ptr addrspace(5) %arrayidx3, align 4
- %out_data0 = load i32, ptr addrspace(5) %stack, align 4
- store i32 %out_data0, ptr addrspace(1) %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
- %out_data1 = load i32, ptr addrspace(5) %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
- store i32 %out_data1, ptr addrspace(1) %arrayidx13
- ret void
-}
-
-; CHECK-LABEL: @explicit_low_occupancy_requested(
-; CHECK-NOT: alloca [5 x i32]
-define amdgpu_kernel void @explicit_low_occupancy_requested(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
-entry:
- %stack = alloca [5 x i32], align 4, addrspace(5)
- %in_data0 = load i32, ptr addrspace(1) %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data0
- store i32 4, ptr addrspace(5) %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
- %in_data1 = load i32, ptr addrspace(1) %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %in_data1
- store i32 5, ptr addrspace(5) %arrayidx3, align 4
- %out_data0 = load i32, ptr addrspace(5) %stack, align 4
- store i32 %out_data0, ptr addrspace(1) %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
- %out_data1 = load i32, ptr addrspace(5) %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
- store i32 %out_data1, ptr addrspace(1) %arrayidx13
- ret void
-}
-
-attributes #0 = { "amdgpu-waves-per-eu"="4,10" "amdgpu-flat-work-group-size"="1,1024" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
More information about the llvm-branch-commits
mailing list