[clang] [llvm] [AMDGPU][Clang] Allow amdgpu-waves-per-eu attribute to lower target occupancy range (PR #138284)

Thu May 22 06:27:39 PDT 2025

https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/138284

>From 9906334cbabf160221dea11aa93e2cf7a154e6da Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 22 May 2025 13:26:57 +0000
Subject: [PATCH] Rebase on main (integrate changes from 1b34722)

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 27 ++++-----
 clang/lib/Sema/SemaAMDGPU.cpp                 |  5 --
 clang/test/SemaOpenCL/amdgpu-attrs.cl         |  1 -
 llvm/lib/IR/Verifier.cpp                      | 23 ++++++++
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  4 ++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    | 46 ++++++++-------
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  4 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        | 12 ++++
 ...-work-group-size-overrides-waves-per-eu.ll |  4 +-
 ...ine-scheduler-sink-trivial-remats-attr.mir | 12 ++--
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  | 56 ++++++++-----------
 .../Verifier/AMDGPU/amdgpu-waves-per-eu.ll    | 40 +++++++++++++
 12 files changed, 153 insertions(+), 81 deletions(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index c8921c434db47..8072d4f363feb 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
     llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
   unsigned Min =
       Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
-  unsigned Max =
-      Attr->getMax()
-          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
-          : 0;
 
-  if (Min != 0) {
-    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
-
-    std::string AttrVal = llvm::utostr(Min);
-    if (Max != 0)
-      AttrVal = AttrVal + "," + llvm::utostr(Max);
-    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
-  } else
-    assert(Max == 0 && "Max must be zero");
+  if (Attr->getMax()) {
+    unsigned Max =
+        Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
+    assert(Min == 0 || (Min != 0 && Max != 0) &&
+                           "Min must be non-zero when Max is non-zero");
+    assert(Min <= Max && "Min must be less than or equal to Max");
+    // Do not add the attribute if min,max=0,0.
+    if (Min != 0) {
+      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
+    }
+  } else if (Min != 0) {
+    F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
+  }
 }
 
 std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index e6414a623b929..9ae3ec1289def 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
   if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
     return true;
 
-  if (Min == 0 && Max != 0) {
-    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
-        << &Attr << 0;
-    return true;
-  }
   if (Max != 0 && Min > Max) {
     S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
         << &Attr << 1;
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl
index 89ba3f86803c5..b9b44dff4d4a9 100644
--- a/clang/test/SemaOpenCL/amdgpu-attrs.cl
+++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
 __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
 
 __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
-__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
 
 __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
 __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 73b4274a41ee6..402af44e38cbf 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2473,6 +2473,29 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
       CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S,
                   V);
   }
+
+  if (TT.isAMDGPU()) {
+    if (auto A = Attrs.getFnAttr("amdgpu-waves-per-eu"); A.isValid()) {
+      std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
+      unsigned Min = 0;
+      StringRef MinStr = Strs.first.trim();
+      Check(!MinStr.getAsInteger(0, Min),
+            "minimum for 'amdgpu-waves-per-eu' must be integer: " + MinStr);
+      if (!Strs.second.empty()) {
+        unsigned Max = 0;
+        StringRef MaxStr = Strs.second.trim();
+        Check(!MaxStr.getAsInteger(0, Max),
+              "maximum for 'amdgpu-waves-per-eu' must be integer: " + MaxStr);
+        Check(Max, "maximum for 'amdgpu-waves-per-eu' must be non-zero");
+        Check(Min <= Max, "minimum must be less than or equal to maximum for "
+                          "'amdgpu-waves-per-eu': " +
+                              MinStr + " > " + MaxStr);
+      } else {
+        Check(Min, "minimum for 'amdgpu-waves-per-eu' must be non-zero when "
+                   "maximum is not provided");
+      }
+    }
+  }
 }
 
 void Verifier::verifyFunctionMetadata(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ee887447972bf..52540bac85a52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1367,6 +1367,10 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
   return Changed;
 }
 
+// 14 ==> 15
+// 15 ==> 16
+// 16 ==> 17
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf9549..1fcd0654fba31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
   }
 }
 
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
-  const Function &F) const {
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
   // Default minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Default =
-    getDefaultFlatWorkGroupSize(F.getCallingConv());
+      getDefaultFlatWorkGroupSize(F.getCallingConv());
 
   // Requested minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
-    F, "amdgpu-flat-work-group-size", Default);
+      F, "amdgpu-flat-work-group-size", Default);
 
   // Make sure requested minimum is less than requested maximum.
   if (Requested.first > Requested.second)
@@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
   // sizes limits the achievable maximum, and we aim to support enough waves per
   // EU so that we can concurrently execute all waves of a single workgroup of
   // maximum size on a CU.
-  std::pair<unsigned, unsigned> Default = {
+  std::pair<unsigned, unsigned> WavesPerEU = {
       getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
       getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
-  Default.first = std::min(Default.first, Default.second);
-
-  // Make sure requested minimum is within the default range and lower than the
-  // requested maximum. The latter must not violate target specification.
-  if (RequestedWavesPerEU.first < Default.first ||
-      RequestedWavesPerEU.first > Default.second ||
-      RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
-      RequestedWavesPerEU.second > getMaxWavesPerEU())
-    return Default;
-
-  // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
-  RequestedWavesPerEU.second =
-      std::min(RequestedWavesPerEU.second, Default.second);
-  return RequestedWavesPerEU;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+
+  // Requested minimum must not violate subtarget's specifications and be no
+  // greater than maximum.
+  if (RequestedWavesPerEU.first &&
+      (RequestedWavesPerEU.first < getMinWavesPerEU() ||
+       RequestedWavesPerEU.first > RequestedWavesPerEU.second))
+    return WavesPerEU;
+  // Requested maximum must not violate subtarget's specifications.
+  if (RequestedWavesPerEU.second > getMaxWavesPerEU())
+    return WavesPerEU;
+
+  // A requested maximum may limit both the final minimum and maximum, but
+  // not increase them. A requested minimum can either decrease or increase the
+  // default minimum as long as it doesn't exceed the maximum.
+  WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
+  if (RequestedWavesPerEU.first)
+    WavesPerEU.first = RequestedWavesPerEU.first;
+  WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+  return WavesPerEU;
 }
 
 std::pair<unsigned, unsigned>
@@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
                                unsigned LDSBytes, const Function &F) const {
   // Default minimum/maximum number of waves per execution unit.
-  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+  std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
 
   // Requested minimum/maximum number of waves per execution unit.
   std::pair<unsigned, unsigned> Requested =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 23826cc22250a..dafadaff97930 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -126,7 +126,9 @@ class AMDGPUSubtarget {
   /// Returns the target minimum/maximum number of waves per EU. This is based
   /// on the minimum/maximum number of \p RequestedWavesPerEU and further
   /// limited by the maximum achievable occupancy derived from the range of \p
-  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+  /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
+  /// minimum requested waves/EU value of 0 indicates an intent to not restrict
+  /// the minimum target occupancy.
   std::pair<unsigned, unsigned>
   getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSizes,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index e9fe4f3c618c7..2ab38a9f12a52 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,3 +225,15 @@ entry:
   ret void
 }
 attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #13 {
+entry:
+  ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
index 67061bcb2a785..f8c7b593e6eba 100644
--- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -57,5 +57,5 @@ entry:
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
+attributes #0 = { "amdgpu-waves-per-eu"="1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index f5558964d2707..6d34e045e55c5 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -1033,6 +1033,7 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
@@ -1060,9 +1061,8 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[DEF]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: reduce_spill_archvgpr_above_addressable_limit
@@ -1660,6 +1660,7 @@ body:             |
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
   ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
   ; GFX908-NEXT:   [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
   ; GFX908-NEXT:   [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -1944,9 +1945,8 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]], implicit [[DEF229]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]], implicit [[DEF239]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]]
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
-  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index b87d266cc2514..5e1cae0760c36 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
-; Check propagation of amdgpu-flat-work-group-size attribute.
+; Check propagation of amdgpu-waves-per-eu attribute.
 
 ; Called from a single kernel with 1,8
 define internal void @default_to_1_8_a() {
@@ -216,41 +216,30 @@ define internal i32 @bitcasted_function() {
   ret i32 0
 }
 
-define internal void @called_from_invalid_bounds_0() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
-; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT:    ret void
-;
+define internal void @called_without_min_waves() {
   ret void
 }
 
-define internal void @called_from_invalid_bounds_1() {
-; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
+define internal void @called_from_invalid_bounds() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds
 ; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
 }
 
-; Invalid range for amdgpu-waves-per-eu
-define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
-; CHECK-SAME: () #[[ATTR1]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_0()
-; CHECK-NEXT:    ret void
-;
-  call void @called_from_invalid_bounds_0()
+define internal void @called_from_invalid_bounds_1() {
+  call void @called_without_min_waves()
   ret void
 }
-
 ; Invalid range for amdgpu-waves-per-eu
 define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
-; CHECK-NEXT:    call void @called_from_invalid_bounds_1()
+; CHECK-NEXT:    call void @called_from_invalid_bounds()
 ; CHECK-NEXT:    ret void
 ;
-  call void @called_from_invalid_bounds_1()
+  call void @called_from_invalid_bounds()
   ret void
 }
 
@@ -279,7 +268,7 @@ define amdgpu_kernel void @kernel_3_6() #12 {
 ; 3,6 -> 6,9
 define internal void @refine_upper_func_3_6() #13 {
 ; CHECK-LABEL: define internal void @refine_upper_func_3_6
-; CHECK-SAME: () #[[ATTR9]] {
+; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
@@ -288,7 +277,7 @@ define internal void @refine_upper_func_3_6() #13 {
 ; 4,8 -> 6,8
 define internal void @refine_lower_func_4_8() #14 {
 ; CHECK-LABEL: define internal void @refine_lower_func_4_8
-; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
 ; CHECK-NEXT:    call void @refine_upper_func_3_6()
 ; CHECK-NEXT:    ret void
 ;
@@ -298,7 +287,7 @@ define internal void @refine_lower_func_4_8() #14 {
 
 define amdgpu_kernel void @kernel_foo_6_8() #15 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8
-; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
 ; CHECK-NEXT:    call void @refine_upper_func_3_6()
 ; CHECK-NEXT:    call void @refine_lower_func_4_8()
 ; CHECK-NEXT:    call void @func_9_10_a()
@@ -313,7 +302,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 {
 ; 5,5 -> 5,5
 define internal void @func_5_5() #16 {
 ; CHECK-LABEL: define internal void @func_5_5
-; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
@@ -322,7 +311,7 @@ define internal void @func_5_5() #16 {
 ; 5,8 -> 8,8
 define internal void @func_5_8() #17 {
 ; CHECK-LABEL: define internal void @func_5_8
-; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
@@ -331,7 +320,7 @@ define internal void @func_5_8() #17 {
 ; 9,10 -> 9,10
 define internal void @func_9_10_a() #18 {
 ; CHECK-LABEL: define internal void @func_9_10_a
-; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
@@ -340,7 +329,7 @@ define internal void @func_9_10_a() #18 {
 ; 9,10 -> 9,9
 define internal void @func_9_10_b() #18 {
 ; CHECK-LABEL: define internal void @func_9_10_b
-; CHECK-SAME: () #[[ATTR18]] {
+; CHECK-SAME: () #[[ATTR19]] {
 ; CHECK-NEXT:    ret void
 ;
   ret void
@@ -348,7 +337,7 @@ define internal void @func_9_10_b() #18 {
 
 define amdgpu_kernel void @kernel_bar_8_9() #19 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9
-; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR20:[0-9]+]] {
 ; CHECK-NEXT:    call void @refine_upper_func_3_6()
 ; CHECK-NEXT:    call void @func_5_5()
 ; CHECK-NEXT:    call void @func_9_10_b()
@@ -413,10 +402,11 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll
new file mode 100644
index 0000000000000..8686a4278147e
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll
@@ -0,0 +1,40 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+define void @valid_amdgpu_waves_per_eu_range() "amdgpu-waves-per-eu"="2,4" {
+  ret void
+}
+
+define void @valid_amdgpu_waves_per_eu_min_only() "amdgpu-waves-per-eu"="2" {
+  ret void
+}
+
+define void @valid_amdgpu_waves_per_eu_max_only() "amdgpu-waves-per-eu"="0,4" {
+  ret void
+}
+
+; CHECK: minimum for 'amdgpu-waves-per-eu' must be integer: x
+define void @invalid_amdgpu_waves_per_eu_min_nan() "amdgpu-waves-per-eu"="x" {
+  ret void
+}
+
+; CHECK: maximum for 'amdgpu-waves-per-eu' must be integer: x
+define void @invalid_amdgpu_waves_per_eu_max_nan() "amdgpu-waves-per-eu"="0,x" {
+  ret void
+}
+
+; CHECK: minimum for 'amdgpu-waves-per-eu' must be non-zero when maximum is not provided
+define void @invalid_amdgpu_waves_per_eu_min_zero() "amdgpu-waves-per-eu"="0" {
+  ret void
+}
+
+; CHECK: maximum for 'amdgpu-waves-per-eu' must be non-zero
+define void @invalid_amdgpu_waves_per_eu_max_zero() "amdgpu-waves-per-eu"="2,0" {
+  ret void
+}
+
+; CHECK: minimum must be less than or equal to maximum for 'amdgpu-waves-per-eu': 2 > 1
+define void @invalid_amdgpu_waves_per_eu_max_lt_min() "amdgpu-waves-per-eu"="2,1" {
+  ret void
+}