[clang] [llvm] [AMDGPU][Clang] Allow amdgpu-waves-per-eu attribute to lower target occupancy range (PR #138284)
Lucas Ramirez via cfe-commits
cfe-commits at lists.llvm.org
Thu May 22 06:27:39 PDT 2025
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/138284
>From 9906334cbabf160221dea11aa93e2cf7a154e6da Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 22 May 2025 13:26:57 +0000
Subject: [PATCH] Rebase on main (integrate changes from 1b34722)
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 27 ++++-----
clang/lib/Sema/SemaAMDGPU.cpp | 5 --
clang/test/SemaOpenCL/amdgpu-attrs.cl | 1 -
llvm/lib/IR/Verifier.cpp | 23 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 ++
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 46 ++++++++-------
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 +-
.../AMDGPU/attr-amdgpu-waves-per-eu.ll | 12 ++++
...-work-group-size-overrides-waves-per-eu.ll | 4 +-
...ine-scheduler-sink-trivial-remats-attr.mir | 12 ++--
.../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 56 ++++++++-----------
.../Verifier/AMDGPU/amdgpu-waves-per-eu.ll | 40 +++++++++++++
12 files changed, 153 insertions(+), 81 deletions(-)
create mode 100644 llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index c8921c434db47..8072d4f363feb 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
unsigned Min =
Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
- unsigned Max =
- Attr->getMax()
- ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
- : 0;
- if (Min != 0) {
- assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
-
- std::string AttrVal = llvm::utostr(Min);
- if (Max != 0)
- AttrVal = AttrVal + "," + llvm::utostr(Max);
- F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
- } else
- assert(Max == 0 && "Max must be zero");
+ if (Attr->getMax()) {
+ unsigned Max =
+ Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
+ assert(Min == 0 || (Min != 0 && Max != 0) &&
+ "Min must be non-zero when Max is non-zero");
+ assert(Min <= Max && "Min must be less than or equal to Max");
+ // Do not add the attribute if min,max=0,0.
+ if (Min != 0) {
+ std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+ F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
+ }
+ } else if (Min != 0) {
+ F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
+ }
}
std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index e6414a623b929..9ae3ec1289def 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
return true;
- if (Min == 0 && Max != 0) {
- S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
- << &Attr << 0;
- return true;
- }
if (Max != 0 && Min > Max) {
S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
<< &Attr << 1;
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl
index 89ba3f86803c5..b9b44dff4d4a9 100644
--- a/clang/test/SemaOpenCL/amdgpu-attrs.cl
+++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
__attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
__attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
-__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
__attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
__attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 73b4274a41ee6..402af44e38cbf 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2473,6 +2473,29 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S,
V);
}
+
+ if (TT.isAMDGPU()) {
+ if (auto A = Attrs.getFnAttr("amdgpu-waves-per-eu"); A.isValid()) {
+ std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
+ unsigned Min = 0;
+ StringRef MinStr = Strs.first.trim();
+ Check(!MinStr.getAsInteger(0, Min),
+ "minimum for 'amdgpu-waves-per-eu' must be integer: " + MinStr);
+ if (!Strs.second.empty()) {
+ unsigned Max = 0;
+ StringRef MaxStr = Strs.second.trim();
+ Check(!MaxStr.getAsInteger(0, Max),
+ "maximum for 'amdgpu-waves-per-eu' must be integer: " + MaxStr);
+ Check(Max, "maximum for 'amdgpu-waves-per-eu' must be non-zero");
+ Check(Min <= Max, "minimum must be less than or equal to maximum for "
+ "'amdgpu-waves-per-eu': " +
+ MinStr + " > " + MaxStr);
+ } else {
+ Check(Min, "minimum for 'amdgpu-waves-per-eu' must be non-zero when "
+ "maximum is not provided");
+ }
+ }
+ }
}
void Verifier::verifyFunctionMetadata(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ee887447972bf..52540bac85a52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1367,6 +1367,10 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
return Changed;
}
+// 14 ==> 15
+// 15 ==> 16
+// 16 ==> 17
+
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options,
ThinOrFullLTOPhase LTOPhase) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf9549..1fcd0654fba31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
}
}
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
- const Function &F) const {
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
// Default minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Default =
- getDefaultFlatWorkGroupSize(F.getCallingConv());
+ getDefaultFlatWorkGroupSize(F.getCallingConv());
// Requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
- F, "amdgpu-flat-work-group-size", Default);
+ F, "amdgpu-flat-work-group-size", Default);
// Make sure requested minimum is less than requested maximum.
if (Requested.first > Requested.second)
@@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
// sizes limits the achievable maximum, and we aim to support enough waves per
// EU so that we can concurrently execute all waves of a single workgroup of
// maximum size on a CU.
- std::pair<unsigned, unsigned> Default = {
+ std::pair<unsigned, unsigned> WavesPerEU = {
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
- Default.first = std::min(Default.first, Default.second);
-
- // Make sure requested minimum is within the default range and lower than the
- // requested maximum. The latter must not violate target specification.
- if (RequestedWavesPerEU.first < Default.first ||
- RequestedWavesPerEU.first > Default.second ||
- RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
- RequestedWavesPerEU.second > getMaxWavesPerEU())
- return Default;
-
- // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
- RequestedWavesPerEU.second =
- std::min(RequestedWavesPerEU.second, Default.second);
- return RequestedWavesPerEU;
+ WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+
+ // Requested minimum must not violate subtarget's specifications and be no
+ // greater than maximum.
+ if (RequestedWavesPerEU.first &&
+ (RequestedWavesPerEU.first < getMinWavesPerEU() ||
+ RequestedWavesPerEU.first > RequestedWavesPerEU.second))
+ return WavesPerEU;
+ // Requested maximum must not violate subtarget's specifications.
+ if (RequestedWavesPerEU.second > getMaxWavesPerEU())
+ return WavesPerEU;
+
+ // A requested maximum may limit both the final minimum and maximum, but
+ // not increase them. A requested minimum can either decrease or increase the
+ // default minimum as long as it doesn't exceed the maximum.
+ WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
+ if (RequestedWavesPerEU.first)
+ WavesPerEU.first = RequestedWavesPerEU.first;
+ WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+ return WavesPerEU;
}
std::pair<unsigned, unsigned>
@@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
unsigned LDSBytes, const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+ std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 23826cc22250a..dafadaff97930 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -126,7 +126,9 @@ class AMDGPUSubtarget {
/// Returns the target minimum/maximum number of waves per EU. This is based
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
/// limited by the maximum achievable occupancy derived from the range of \p
- /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+ /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
+ /// minimum requested waves/EU value of 0 indicates an intent to not restrict
+ /// the minimum target occupancy.
std::pair<unsigned, unsigned>
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSizes,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index e9fe4f3c618c7..2ab38a9f12a52 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,3 +225,15 @@ entry:
ret void
}
attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #13 {
+entry:
+ ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
index 67061bcb2a785..f8c7b593e6eba 100644
--- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -57,5 +57,5 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
+attributes #0 = { "amdgpu-waves-per-eu"="1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index f5558964d2707..6d34e045e55c5 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -1033,6 +1033,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
@@ -1060,9 +1061,8 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[DEF]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: reduce_spill_archvgpr_above_addressable_limit
@@ -1660,6 +1660,7 @@ body: |
; GFX908: bb.0:
; GFX908-NEXT: successors: %bb.1(0x80000000)
; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
; GFX908-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -1944,9 +1945,8 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]], implicit [[DEF229]]
; GFX908-NEXT: S_NOP 0, implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]], implicit [[DEF239]]
; GFX908-NEXT: S_NOP 0, implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]]
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index b87d266cc2514..5e1cae0760c36 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
-; Check propagation of amdgpu-flat-work-group-size attribute.
+; Check propagation of amdgpu-waves-per-eu attribute.
; Called from a single kernel with 1,8
define internal void @default_to_1_8_a() {
@@ -216,41 +216,30 @@ define internal i32 @bitcasted_function() {
ret i32 0
}
-define internal void @called_from_invalid_bounds_0() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
-; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT: ret void
-;
+define internal void @called_without_min_waves() {
ret void
}
-define internal void @called_from_invalid_bounds_1() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
+define internal void @called_from_invalid_bounds() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds
; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
-; Invalid range for amdgpu-waves-per-eu
-define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
-; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT: call void @called_from_invalid_bounds_0()
-; CHECK-NEXT: ret void
-;
- call void @called_from_invalid_bounds_0()
+define internal void @called_from_invalid_bounds_1() {
+ call void @called_without_min_waves()
ret void
}
-
; Invalid range for amdgpu-waves-per-eu
define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
-; CHECK-NEXT: call void @called_from_invalid_bounds_1()
+; CHECK-NEXT: call void @called_from_invalid_bounds()
; CHECK-NEXT: ret void
;
- call void @called_from_invalid_bounds_1()
+ call void @called_from_invalid_bounds()
ret void
}
@@ -279,7 +268,7 @@ define amdgpu_kernel void @kernel_3_6() #12 {
; 3,6 -> 6,9
define internal void @refine_upper_func_3_6() #13 {
; CHECK-LABEL: define internal void @refine_upper_func_3_6
-; CHECK-SAME: () #[[ATTR9]] {
+; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -288,7 +277,7 @@ define internal void @refine_upper_func_3_6() #13 {
; 4,8 -> 6,8
define internal void @refine_lower_func_4_8() #14 {
; CHECK-LABEL: define internal void @refine_lower_func_4_8
-; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: ret void
;
@@ -298,7 +287,7 @@ define internal void @refine_lower_func_4_8() #14 {
define amdgpu_kernel void @kernel_foo_6_8() #15 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8
-; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: call void @refine_lower_func_4_8()
; CHECK-NEXT: call void @func_9_10_a()
@@ -313,7 +302,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 {
; 5,5 -> 5,5
define internal void @func_5_5() #16 {
; CHECK-LABEL: define internal void @func_5_5
-; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -322,7 +311,7 @@ define internal void @func_5_5() #16 {
; 5,8 -> 8,8
define internal void @func_5_8() #17 {
; CHECK-LABEL: define internal void @func_5_8
-; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -331,7 +320,7 @@ define internal void @func_5_8() #17 {
; 9,10 -> 9,10
define internal void @func_9_10_a() #18 {
; CHECK-LABEL: define internal void @func_9_10_a
-; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -340,7 +329,7 @@ define internal void @func_9_10_a() #18 {
; 9,10 -> 9,9
define internal void @func_9_10_b() #18 {
; CHECK-LABEL: define internal void @func_9_10_b
-; CHECK-SAME: () #[[ATTR18]] {
+; CHECK-SAME: () #[[ATTR19]] {
; CHECK-NEXT: ret void
;
ret void
@@ -348,7 +337,7 @@ define internal void @func_9_10_b() #18 {
define amdgpu_kernel void @kernel_bar_8_9() #19 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9
-; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR20:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: call void @func_5_5()
; CHECK-NEXT: call void @func_9_10_b()
@@ -413,10 +402,11 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll
new file mode 100644
index 0000000000000..8686a4278147e
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll
@@ -0,0 +1,40 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+define void @valid_amdgpu_waves_per_eu_range() "amdgpu-waves-per-eu"="2,4" {
+ ret void
+}
+
+define void @valid_amdgpu_waves_per_eu_min_only() "amdgpu-waves-per-eu"="2" {
+ ret void
+}
+
+define void @valid_amdgpu_waves_per_eu_max_only() "amdgpu-waves-per-eu"="0,4" {
+ ret void
+}
+
+; CHECK: minimum for 'amdgpu-waves-per-eu' must be integer: x
+define void @invalid_amdgpu_waves_per_eu_min_nan() "amdgpu-waves-per-eu"="x" {
+ ret void
+}
+
+; CHECK: maximum for 'amdgpu-waves-per-eu' must be integer: x
+define void @invalid_amdgpu_waves_per_eu_max_nan() "amdgpu-waves-per-eu"="0,x" {
+ ret void
+}
+
+; CHECK: minimum for 'amdgpu-waves-per-eu' must be non-zero when maximum is not provided
+define void @invalid_amdgpu_waves_per_eu_min_zero() "amdgpu-waves-per-eu"="0" {
+ ret void
+}
+
+; CHECK: maximum for 'amdgpu-waves-per-eu' must be non-zero
+define void @invalid_amdgpu_waves_per_eu_max_zero() "amdgpu-waves-per-eu"="2,0" {
+ ret void
+}
+
+; CHECK: minimum must be less than or equal to maximum for 'amdgpu-waves-per-eu': 2 > 1
+define void @invalid_amdgpu_waves_per_eu_max_lt_min() "amdgpu-waves-per-eu"="2,1" {
+ ret void
+}
More information about the cfe-commits
mailing list