[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)

Jun Wang via cfe-commits cfe-commits at lists.llvm.org
Mon Feb 26 11:23:52 PST 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79035

>From 5c088a59bd36df40bae9a3a712f3994feded359d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 12:43:27 -0600
Subject: [PATCH 01/11] [AMDGPU] Adding the amdgpu-num-work-groups function
 attribute

A new function attribute named amdgpu-num-work-groups is added.
This attribute, which consists of three integers, allows programmers
to let the compiler know the number of workgroups to be launched in
each of the three dimensions and do optimizations based on that
information.
---
 clang/include/clang/Basic/Attr.td             |  7 ++
 clang/include/clang/Basic/AttrDocs.td         | 24 +++++++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 13 ++++
 clang/lib/Sema/SemaDeclAttr.cpp               | 22 +++++++
 ...a-attribute-supported-attributes-list.test |  1 +
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  8 +++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  5 ++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  3 +
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  2 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 53 +++++++++++++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++++++
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 65 +++++++++++++++++++
 13 files changed, 232 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 58838b01b4fd7c..1b4718258d91e6 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e02a1201e2ad79..e8fd10587a8022 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,6 +2705,30 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc8..93321efd26462c 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getNumWorkGroupsX();
+    uint32_t Y = Attr->getNumWorkGroupsY();
+    uint32_t Z = Attr->getNumWorkGroupsZ();
+
+    if (X != 0 && Y != 0 && Z != 0) {
+      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
+                            llvm::utostr(Y) + std::string(", ") +
+                            llvm::utostr(Z);
+      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+    }
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 069571fcf78641..98d1726bb3e0b8 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWGX = 0;
+  uint32_t NumWGY = 0;
+  uint32_t NumWGZ = 0;
+  Expr *NumWGXExpr = AL.getArgAsExpr(0);
+  Expr *NumWGYExpr = AL.getArgAsExpr(1);
+  Expr *NumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
+                                                       NumWGY, NumWGZ));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9173,6 +9192,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index e476c15b35ded9..3d12656612eb06 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 186fa58524b9f8..4ee48c6fe79088 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWGX = MFI.getNumWorkGroupsX();
+  unsigned NumWGY = MFI.getNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index bcc7dedf322969..4cdf61cf904984 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+SmallVector<unsigned>
+AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a1..90c394b6e3b252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b94d143a75e5ed..fec8650f01d766 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
+  assert(NumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea74..18ceb282a85db2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+
+  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
+  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
+  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 106fdb19f27895..398d78341e41cd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -1253,6 +1254,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size) {
+  assert(Size > 2);
+  SmallVector<unsigned> Default(Size, 0);
+
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  SmallVector<unsigned> Vals(Size, 0);
+
+  LLVMContext &Ctx = F.getContext();
+
+  StringRef S = A.getValueAsString();
+  unsigned i = 0;
+  for (; !S.empty() && i < Size; i++) {
+    std::pair<StringRef, StringRef> Strs = S.split(',');
+    unsigned IntVal;
+    if (Strs.first.trim().getAsInteger(0, IntVal)) {
+      Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
+                    Name);
+      return Default;
+    }
+    Vals[i] = IntVal;
+    S = Strs.second;
+  }
+
+  if (!S.empty() || i < Size) {
+    Ctx.emitError("attribute " + Name +
+                  " has incorrect number of integers; expected " +
+                  llvm::utostr(Size));
+    return Default;
+  }
+  return Vals;
+}
+
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 11b0bc5c81711e..92a4ab71c2f055 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -814,6 +814,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
@@ -828,6 +837,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
                         std::pair<unsigned, unsigned> Default,
                         bool OnlyFirstRequired = false);
 
+/// \returns Generate a vector of integer values requested using \p F's \p Name
+/// attribute.
+///
+/// \returns true if exactly Size (>2) number of integers are found in the
+/// attribute.
+///
+/// \returns false if any error occurs.
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size);
+
 /// Represents the counter values to wait for in an s_waitcnt instruction.
 ///
 /// Large values (including the maximum possible integer) can be used to
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
new file mode 100644
index 00000000000000..6fc6de91d1d030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
+define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
+define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
+define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1_2_3
+; CHECK-NEXT:   .num_work_groups_x: 1
+; CHECK-NEXT:   .num_work_groups_y: 2
+; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .num_work_groups_x: 1024
+; CHECK-NEXT:   .num_work_groups_y: 1024
+; CHECK-NEXT:   .num_work_groups_z: 1024

>From c4e460b39c5a50870929a041a89026b1135f6e0a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 4 Feb 2024 18:23:06 -0600
Subject: [PATCH 02/11] Support 2 attributes: one for min and one for max
 number of work groups.

---
 clang/include/clang/Basic/Attr.td             |  15 +-
 clang/include/clang/Basic/AttrDocs.td         |  30 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  57 +++-
 clang/lib/Sema/SemaDeclAttr.cpp               |  52 +++-
 ...a-attribute-supported-attributes-list.test |   3 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  40 ++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   8 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |   7 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   6 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  16 +-
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 292 ++++++++++++++++--
 11 files changed, 444 insertions(+), 82 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 1b4718258d91e6..4044b7e1748af7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,10 +2031,17 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
-  let Documentation = [AMDGPUNumWorkGroupsDocs];
+def AMDGPUMinNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMinNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e8fd10587a8022..65c5e66f963fad 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,14 +2705,38 @@ An error will be given if:
   }];
 }
 
-def AMDGPUNumWorkGroupsDocs : Documentation {
+def AMDGPUMinNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The number of work groups specifies the number of work groups when the kernel
+The min number of work groups specifies the min number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The max number of work groups specifies the max number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
 AMDGPU target. This attribute may be attached to a kernel function definition
 and is an optimization hint.
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 93321efd26462c..d9c1807aa39705 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,18 +357,55 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getNumWorkGroupsX();
-    uint32_t Y = Attr->getNumWorkGroupsY();
-    uint32_t Z = Attr->getNumWorkGroupsZ();
-
-    if (X != 0 && Y != 0 && Z != 0) {
-      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
-                            llvm::utostr(Y) + std::string(", ") +
-                            llvm::utostr(Z);
-      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+  uint32_t MinWGX = 0;
+  uint32_t MinWGY = 0;
+  uint32_t MinWGZ = 0;
+
+  uint32_t MaxWGX = 0;
+  uint32_t MaxWGY = 0;
+  uint32_t MaxWGZ = 0;
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
+    MinWGX = Attr->getMinNumWorkGroupsX();
+    MinWGY = Attr->getMinNumWorkGroupsY();
+    MinWGZ = Attr->getMinNumWorkGroupsZ();
+
+    if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
+      IsMinNumWGValid = true;
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    MaxWGX = Attr->getMaxNumWorkGroupsX();
+    MaxWGY = Attr->getMaxNumWorkGroupsY();
+    MaxWGZ = Attr->getMaxNumWorkGroupsZ();
+
+    if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
+      IsMaxNumWGValid = true;
+  }
+
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
     }
   }
+
+  if (IsMinNumWGValid) {
+    std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
+                          llvm::utostr(MinWGY) + std::string(", ") +
+                          llvm::utostr(MinWGZ);
+    F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
+  }
+
+  if (IsMaxNumWGValid) {
+    std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
+                          llvm::utostr(MaxWGY) + std::string(", ") +
+                          llvm::utostr(MaxWGZ);
+    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 98d1726bb3e0b8..964bde03e31d4b 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,23 +8069,42 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
-                                          const ParsedAttr &AL) {
-  uint32_t NumWGX = 0;
-  uint32_t NumWGY = 0;
-  uint32_t NumWGZ = 0;
-  Expr *NumWGXExpr = AL.getArgAsExpr(0);
-  Expr *NumWGYExpr = AL.getArgAsExpr(1);
-  Expr *NumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MinNumWGX = 0;
+  uint32_t MinNumWGY = 0;
+  uint32_t MinNumWGZ = 0;
+  Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
+      S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
+}
+
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MaxNumWGX = 0;
+  uint32_t MaxNumWGY = 0;
+  uint32_t MaxNumWGZ = 0;
+  Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+  if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+  if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
-                                                       NumWGY, NumWGZ));
+  D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
+      S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9211,11 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUNumWorkGroups:
-    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
+    handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 3d12656612eb06..82d8340b565ce9 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,9 +4,10 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 4ee48c6fe79088..aaf334adea1cf9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,13 +494,39 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-  unsigned NumWGX = MFI.getNumWorkGroupsX();
-  unsigned NumWGY = MFI.getNumWorkGroupsY();
-  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
-    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
-    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
-    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+
+  unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
+  unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
+  unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
+
+  unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+  if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
+    IsMinNumWGValid = true;
+  if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
+    IsMaxNumWGValid = true;
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
+        MaxNumWGZ < MinNumWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
+    }
+  }
+
+  if (IsMinNumWGValid) {
+    Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
+    Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
+    Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
+  }
+
+  if (IsMaxNumWGValid) {
+    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
+    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
+    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4cdf61cf904984..6c9f0900e96f60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,6 +1110,10 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
+}
+SmallVector<unsigned>
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90c394b6e3b252..23fd75d13f199c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,8 +288,11 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
-  /// Return the number of work groups for the function.
-  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+  /// Return the min number of work groups for the function.
+  SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
+
+  /// Return the max number of work groups for the function.
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index fec8650f01d766..152d8a1ee6ec08 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  NumWorkGroups = ST.getNumWorkGroups(F);
-  assert(NumWorkGroups.size() == 3);
+  MinNumWorkGroups = ST.getMinNumWorkGroups(F);
+  assert(MinNumWorkGroups.size() == 3);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 18ceb282a85db2..35f43a6d35b948 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,7 +427,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1077,11 +1078,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+  SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
-  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
-  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
-  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
+  unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
+  unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
+  unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
+
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 6fc6de91d1d030..eec7819526238d 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -8,58 +8,288 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
-define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_x0:
+define amdgpu_kernel void @empty_min_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+attributes #0 = {"amdgpu-min-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
+define amdgpu_kernel void @empty_max_num_work_groups_x0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-max-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_y0:
+define amdgpu_kernel void @empty_min_num_work_groups_y0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-min-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
-define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
+define amdgpu_kernel void @empty_max_num_work_groups_y0() #3 {
 entry:
   ret void
 }
-attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+attributes #3 = {"amdgpu-max-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
-define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_z0:
+define amdgpu_kernel void @empty_min_num_work_groups_z0() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-min-num-work-groups"="1,2,0"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
+define amdgpu_kernel void @empty_max_num_work_groups_z0() #5 {
+entry:
+  ret void
+}
+attributes #5 = {"amdgpu-max-num-work-groups"="1,2,0"}
+
+
+
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_min_num_work_groups_1_2_3() #20 {
+entry:
+  ret void
+}
+attributes #20 = {"amdgpu-min-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #21 {
+entry:
+  ret void
+}
+attributes #21 = {"amdgpu-max-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_min_num_work_groups_1024_1024_1024() #22 {
+entry:
+  ret void
+}
+attributes #22 = {"amdgpu-min-num-work-groups"="1024,1024,1024"}
+
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #23 {
+entry:
+  ret void
+}
+attributes #23 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_min:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_min() #30 {
+entry:
+  ret void
+}
+attributes #30 = {"amdgpu-min-num-work-groups"="0,2,3" "amdgpu-max-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_max:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_max() #31 {
+entry:
+  ret void
+}
+attributes #31 = {"amdgpu-min-num-work-groups"="1,2,3" "amdgpu-max-num-work-groups"="0,2,3"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_x:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_x() #40 {
+entry:
+  ret void
+}
+attributes #40 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="1,3,4"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_y:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_y() #41 {
 entry:
   ret void
 }
-attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+attributes #41 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,1,4"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_z:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_z() #42 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+attributes #42 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,1"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_equal1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_equal1() #50 {
 entry:
   ret void
 }
-attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+attributes #50 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,4"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal1() #60 {
+entry:
+  ret void
+}
+attributes #60 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,30,40"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal2:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal2() #61 {
+entry:
+  ret void
+}
+attributes #61 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,3,40"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal3:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal3() #62 {
+entry:
+  ret void
+}
+attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,4"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater1() #62 {
+entry:
+  ret void
+}
+attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,40"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .name:           empty_no_attribute
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_x0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_y0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_z0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_1_2_3
-; CHECK-NEXT:   .num_work_groups_x: 1
-; CHECK-NEXT:   .num_work_groups_y: 2
-; CHECK-NEXT:   .num_work_groups_z: 3
-; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
-; CHECK-NEXT:   .num_work_groups_x: 1024
-; CHECK-NEXT:   .num_work_groups_y: 1024
-; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_x0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_y0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_z0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1
+; CHECK-NEXT:   .min_num_work_groups_y: 2
+; CHECK-NEXT:   .min_num_work_groups_z: 3
+; CHECK:        .name:           empty_min_num_work_groups_1_2_3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1024
+; CHECK-NEXT:   .min_num_work_groups_y: 1024
+; CHECK-NEXT:   .min_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_1024_1024_1024
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1024
+; CHECK-NEXT:   .max_num_work_groups_y: 1024
+; CHECK-NEXT:   .max_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_min
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1
+; CHECK-NEXT:   .min_num_work_groups_y: 2
+; CHECK-NEXT:   .min_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_max
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_x
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_y
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_z
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 2
+; CHECK-NEXT:   .max_num_work_groups_y: 3
+; CHECK-NEXT:   .max_num_work_groups_z: 4
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_equal1
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 2
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal1
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 3
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal2
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 4
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater1

>From 5485ab8007b733811f7f3d3e4ec1cfc518c7aae6 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 9 Feb 2024 15:57:07 -0600
Subject: [PATCH 03/11] Revert "Support 2 attributes: one for min and one for
 max number of work groups."

This reverts commit c4e460b39c5a50870929a041a89026b1135f6e0a.
---
 clang/include/clang/Basic/Attr.td             |  15 +-
 clang/include/clang/Basic/AttrDocs.td         |  30 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  57 +---
 clang/lib/Sema/SemaDeclAttr.cpp               |  52 +---
 ...a-attribute-supported-attributes-list.test |   3 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  40 +--
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   8 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |   7 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   6 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  16 +-
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 292 ++----------------
 11 files changed, 82 insertions(+), 444 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4044b7e1748af7..1b4718258d91e6 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,17 +2031,10 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUMinNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
-  let Documentation = [AMDGPUMinNumWorkGroupsDocs];
-  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
-}
-
-def AMDGPUMaxNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
-  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 65c5e66f963fad..e8fd10587a8022 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,38 +2705,14 @@ An error will be given if:
   }];
 }
 
-def AMDGPUMinNumWorkGroupsDocs : Documentation {
+def AMDGPUNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The min number of work groups specifies the min number of work groups when the kernel
+The number of work groups specifies the number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
-AMDGPU target. This attribute may be attached to a kernel function definition
-and is an optimization hint.
-
-``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
-Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
-
-If specified, the AMDGPU target backend might be able to produce better machine
-code.
-
-An error will be given if:
-  - Specified values violate subtarget specifications;
-  - Specified values are not compatible with values provided through other
-    attributes.
-  }];
-}
-
-def AMDGPUMaxNumWorkGroupsDocs : Documentation {
-  let Category = DocCatAMDGPUAttributes;
-  let Content = [{
-The max number of work groups specifies the max number of work groups when the kernel
-is dispatched.
-
-Clang supports the
-``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
 AMDGPU target. This attribute may be attached to a kernel function definition
 and is an optimization hint.
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index d9c1807aa39705..93321efd26462c 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,55 +357,18 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  uint32_t MinWGX = 0;
-  uint32_t MinWGY = 0;
-  uint32_t MinWGZ = 0;
-
-  uint32_t MaxWGX = 0;
-  uint32_t MaxWGY = 0;
-  uint32_t MaxWGZ = 0;
-
-  bool IsMinNumWGValid = false;
-  bool IsMaxNumWGValid = false;
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
-    MinWGX = Attr->getMinNumWorkGroupsX();
-    MinWGY = Attr->getMinNumWorkGroupsY();
-    MinWGZ = Attr->getMinNumWorkGroupsZ();
-
-    if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
-      IsMinNumWGValid = true;
-  }
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    MaxWGX = Attr->getMaxNumWorkGroupsX();
-    MaxWGY = Attr->getMaxNumWorkGroupsY();
-    MaxWGZ = Attr->getMaxNumWorkGroupsZ();
-
-    if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
-      IsMaxNumWGValid = true;
-  }
-
-  if (IsMinNumWGValid && IsMaxNumWGValid) {
-    if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
-      IsMinNumWGValid = false;
-      IsMaxNumWGValid = false;
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getNumWorkGroupsX();
+    uint32_t Y = Attr->getNumWorkGroupsY();
+    uint32_t Z = Attr->getNumWorkGroupsZ();
+
+    if (X != 0 && Y != 0 && Z != 0) {
+      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
+                            llvm::utostr(Y) + std::string(", ") +
+                            llvm::utostr(Z);
+      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
     }
   }
-
-  if (IsMinNumWGValid) {
-    std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
-                          llvm::utostr(MinWGY) + std::string(", ") +
-                          llvm::utostr(MinWGZ);
-    F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
-  }
-
-  if (IsMaxNumWGValid) {
-    std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
-                          llvm::utostr(MaxWGY) + std::string(", ") +
-                          llvm::utostr(MaxWGZ);
-    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
-  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 964bde03e31d4b..98d1726bb3e0b8 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,42 +8069,23 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
-                                             const ParsedAttr &AL) {
-  uint32_t MinNumWGX = 0;
-  uint32_t MinNumWGY = 0;
-  uint32_t MinNumWGZ = 0;
-  Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
-  Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
-  Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
-    return;
-  if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
-    return;
-  if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
-    return;
-
-  D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
-      S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
-}
-
-static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
-                                             const ParsedAttr &AL) {
-  uint32_t MaxNumWGX = 0;
-  uint32_t MaxNumWGY = 0;
-  uint32_t MaxNumWGZ = 0;
-  Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
-  Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
-  Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWGX = 0;
+  uint32_t NumWGY = 0;
+  uint32_t NumWGZ = 0;
+  Expr *NumWGXExpr = AL.getArgAsExpr(0);
+  Expr *NumWGYExpr = AL.getArgAsExpr(1);
+  Expr *NumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
     return;
-  if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
     return;
-  if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
-      S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
+                                                       NumWGY, NumWGZ));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9211,11 +9192,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
-    handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
-    break;
-  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
-    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 82d8340b565ce9..3d12656612eb06 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,10 +4,9 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index aaf334adea1cf9..4ee48c6fe79088 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,39 +494,13 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-
-  unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
-  unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
-  unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
-
-  unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
-  unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
-  unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
-
-  bool IsMinNumWGValid = false;
-  bool IsMaxNumWGValid = false;
-  if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
-    IsMinNumWGValid = true;
-  if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
-    IsMaxNumWGValid = true;
-  if (IsMinNumWGValid && IsMaxNumWGValid) {
-    if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
-        MaxNumWGZ < MinNumWGZ) {
-      IsMinNumWGValid = false;
-      IsMaxNumWGValid = false;
-    }
-  }
-
-  if (IsMinNumWGValid) {
-    Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
-    Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
-    Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
-  }
-
-  if (IsMaxNumWGValid) {
-    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
-    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
-    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
+  unsigned NumWGX = MFI.getNumWorkGroupsX();
+  unsigned NumWGY = MFI.getNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6c9f0900e96f60..4cdf61cf904984 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,10 +1110,6 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
-}
-SmallVector<unsigned>
-AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
+AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 23fd75d13f199c..90c394b6e3b252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,11 +288,8 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
-  /// Return the min number of work groups for the function.
-  SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
-
-  /// Return the max number of work groups for the function.
-  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 152d8a1ee6ec08..fec8650f01d766 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,10 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  MinNumWorkGroups = ST.getMinNumWorkGroups(F);
-  assert(MinNumWorkGroups.size() == 3);
-  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
-  assert(MaxNumWorkGroups.size() == 3);
+  NumWorkGroups = ST.getNumWorkGroups(F);
+  assert(NumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 35f43a6d35b948..18ceb282a85db2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,8 +427,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
-  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1078,16 +1077,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
-  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
+  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
 
-  unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
-  unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
-  unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
-
-  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
-  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
-  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
+  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
+  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index eec7819526238d..6fc6de91d1d030 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -8,288 +8,58 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_x0:
-define amdgpu_kernel void @empty_min_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
+define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-min-num-work-groups"="0,2,3"}
-
-; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
-define amdgpu_kernel void @empty_max_num_work_groups_x0() #1 {
-entry:
-  ret void
-}
-attributes #1 = {"amdgpu-max-num-work-groups"="0,2,3"}
-
-; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_y0:
-define amdgpu_kernel void @empty_min_num_work_groups_y0() #2 {
-entry:
-  ret void
-}
-attributes #2 = {"amdgpu-min-num-work-groups"="1,0,3"}
+attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
-define amdgpu_kernel void @empty_max_num_work_groups_y0() #3 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
+define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-max-num-work-groups"="1,0,3"}
+attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_z0:
-define amdgpu_kernel void @empty_min_num_work_groups_z0() #4 {
-entry:
-  ret void
-}
-attributes #4 = {"amdgpu-min-num-work-groups"="1,2,0"}
-
-; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
-define amdgpu_kernel void @empty_max_num_work_groups_z0() #5 {
-entry:
-  ret void
-}
-attributes #5 = {"amdgpu-max-num-work-groups"="1,2,0"}
-
-
-
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_min_num_work_groups_1_2_3() #20 {
-entry:
-  ret void
-}
-attributes #20 = {"amdgpu-min-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #21 {
-entry:
-  ret void
-}
-attributes #21 = {"amdgpu-max-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_min_num_work_groups_1024_1024_1024() #22 {
-entry:
-  ret void
-}
-attributes #22 = {"amdgpu-min-num-work-groups"="1024,1024,1024"}
-
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #23 {
-entry:
-  ret void
-}
-attributes #23 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_min:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_min() #30 {
-entry:
-  ret void
-}
-attributes #30 = {"amdgpu-min-num-work-groups"="0,2,3" "amdgpu-max-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_max:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_max() #31 {
-entry:
-  ret void
-}
-attributes #31 = {"amdgpu-min-num-work-groups"="1,2,3" "amdgpu-max-num-work-groups"="0,2,3"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_x:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_x() #40 {
-entry:
-  ret void
-}
-attributes #40 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="1,3,4"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_y:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_y() #41 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
+define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
 entry:
   ret void
 }
-attributes #41 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,1,4"}
+attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
 
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_z:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_z() #42 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
 entry:
   ret void
 }
-attributes #42 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,1"}
+attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
 
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_equal1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_equal1() #50 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
 entry:
   ret void
 }
-attributes #50 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,4"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal1() #60 {
-entry:
-  ret void
-}
-attributes #60 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,30,40"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal2:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal2() #61 {
-entry:
-  ret void
-}
-attributes #61 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,3,40"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal3:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal3() #62 {
-entry:
-  ret void
-}
-attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,4"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater1() #62 {
-entry:
-  ret void
-}
-attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,40"}
+attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_no_attribute
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_x0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_y0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_z0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1
-; CHECK-NEXT:   .min_num_work_groups_y: 2
-; CHECK-NEXT:   .min_num_work_groups_z: 3
-; CHECK:        .name:           empty_min_num_work_groups_1_2_3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1
-; CHECK-NEXT:   .max_num_work_groups_y: 2
-; CHECK-NEXT:   .max_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1024
-; CHECK-NEXT:   .min_num_work_groups_y: 1024
-; CHECK-NEXT:   .min_num_work_groups_z: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_1024_1024_1024
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1024
-; CHECK-NEXT:   .max_num_work_groups_y: 1024
-; CHECK-NEXT:   .max_num_work_groups_z: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1
-; CHECK-NEXT:   .max_num_work_groups_y: 2
-; CHECK-NEXT:   .max_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_min
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1
-; CHECK-NEXT:   .min_num_work_groups_y: 2
-; CHECK-NEXT:   .min_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_max
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_x
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_y
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_z
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 2
-; CHECK-NEXT:   .max_num_work_groups_y: 3
-; CHECK-NEXT:   .max_num_work_groups_z: 4
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_equal1
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 2
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal1
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 3
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal2
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 4
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater1
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1_2_3
+; CHECK-NEXT:   .num_work_groups_x: 1
+; CHECK-NEXT:   .num_work_groups_y: 2
+; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .num_work_groups_x: 1024
+; CHECK-NEXT:   .num_work_groups_y: 1024
+; CHECK-NEXT:   .num_work_groups_z: 1024

>From 09d4867a59099b2fa5ea204e001f49918887d651 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 9 Feb 2024 17:00:12 -0600
Subject: [PATCH 04/11] Minor change to the test file

---
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 6fc6de91d1d030..8b1ee07972c86a 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -47,19 +47,38 @@ attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .name:           empty_no_attribute
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_x0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_x0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_y0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_y0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_z0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_z0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_1_2_3
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_1_2_3
 ; CHECK-NEXT:   .num_work_groups_x: 1
 ; CHECK-NEXT:   .num_work_groups_y: 2
 ; CHECK-NEXT:   .num_work_groups_z: 3
-; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_1024_1024_1024
 ; CHECK-NEXT:   .num_work_groups_x: 1024
 ; CHECK-NEXT:   .num_work_groups_y: 1024
 ; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0

>From 8c2b9775ff89b2ea0dc619afd9397bbf7dac4ee9 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 16 Feb 2024 18:48:36 -0600
Subject: [PATCH 05/11] Update based on code review.

---
 clang/include/clang/Basic/Attr.td             |  8 +--
 clang/include/clang/Basic/AttrDocs.td         | 13 ++---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 21 ++++----
 clang/lib/Sema/SemaDeclAttr.cpp               | 11 ++--
 ...a-attribute-supported-attributes-list.test |  2 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      | 12 ++---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  4 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  2 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  4 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 ++--
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 54 +++++++++----------
 11 files changed, 71 insertions(+), 70 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 1b4718258d91e6..059aa79e08f00b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,10 +2031,10 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
-  let Documentation = [AMDGPUNumWorkGroupsDocs];
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [ExprArgument<"MaxNumWorkGroupsX">, ExprArgument<"MaxNumWorkGroupsY">, ExprArgument<"MaxNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e8fd10587a8022..f9bea390736b4b 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,18 +2705,19 @@ An error will be given if:
   }];
 }
 
-def AMDGPUNumWorkGroupsDocs : Documentation {
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The number of work groups specifies the number of work groups when the kernel
+This attribute specifies the max number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
-AMDGPU target. This attribute may be attached to a kernel function definition
-and is an optimization hint.
+``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` or
+``[[clang::amdgpu_max_num_work_groups(<x>, <y>, <z>)]]`` attribute for the
+AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
+definitions and is an optimization hint.
 
-``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+``<x>`` parameter specifies the maximum number of work groups in the x dimension.
 Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
 
 If specified, the AMDGPU target backend might be able to produce better machine
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 93321efd26462c..7bff652f1392b6 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,17 +357,16 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getNumWorkGroupsX();
-    uint32_t Y = Attr->getNumWorkGroupsY();
-    uint32_t Z = Attr->getNumWorkGroupsZ();
-
-    if (X != 0 && Y != 0 && Z != 0) {
-      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
-                            llvm::utostr(Y) + std::string(", ") +
-                            llvm::utostr(Z);
-      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
-    }
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getMaxNumWorkGroupsX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ", " << Y << ", " << Z;
+     
+    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal.str());
   }
 }
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 98d1726bb3e0b8..a38391bda1d1ab 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,7 +8069,7 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
                                           const ParsedAttr &AL) {
   uint32_t NumWGX = 0;
   uint32_t NumWGY = 0;
@@ -8084,8 +8084,9 @@ static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
   if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
-                                                       NumWGY, NumWGZ));
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ !=0)
+    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(S.Context, AL, NumWGXExpr,
+                                                       NumWGYExpr, NumWGZExpr));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9193,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUNumWorkGroups:
-    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 3d12656612eb06..217f50fa6adc21 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,9 +4,9 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 4ee48c6fe79088..ba933574e3a0cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,13 +494,13 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-  unsigned NumWGX = MFI.getNumWorkGroupsX();
-  unsigned NumWGY = MFI.getNumWorkGroupsY();
-  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
   if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
-    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
-    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
-    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4cdf61cf904984..cb7592ef83451b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,6 +1110,6 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90c394b6e3b252..e2d8b5d1ce9790 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -289,7 +289,7 @@ class AMDGPUSubtarget {
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
   /// Return the number of work groups for the function.
-  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index fec8650f01d766..d09b5f5d6f32ac 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,8 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  NumWorkGroups = ST.getNumWorkGroups(F);
-  assert(NumWorkGroups.size() == 3);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 18ceb282a85db2..7d0c1ba8448e6c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,7 +427,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1077,11 +1077,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
-  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
-  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
-  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 8b1ee07972c86a..1ec932fb227341 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
 ; Attribute not specified.
 ; CHECK-LABEL: {{^}}empty_no_attribute:
@@ -8,42 +8,42 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
-define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
+define amdgpu_kernel void @empty_max_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+attributes #0 = {"amdgpu-max-num-work-groups"="0,2,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
-define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
+define amdgpu_kernel void @empty_max_num_work_groups_y0() #1 {
 entry:
   ret void
 }
-attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+attributes #1 = {"amdgpu-max-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
-define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
+define amdgpu_kernel void @empty_max_num_work_groups_z0() #2 {
 entry:
   ret void
 }
-attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+attributes #2 = {"amdgpu-max-num-work-groups"="1,2,0"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #3 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+attributes #3 = {"amdgpu-max-num-work-groups"="1,2,3"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #4 {
 entry:
   ret void
 }
-attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+attributes #4 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
@@ -54,31 +54,31 @@ attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_1_2_3
-; CHECK-NEXT:   .num_work_groups_x: 1
-; CHECK-NEXT:   .num_work_groups_y: 2
-; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_1024_1024_1024
-; CHECK-NEXT:   .num_work_groups_x: 1024
-; CHECK-NEXT:   .num_work_groups_y: 1024
-; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1024
+; CHECK-NEXT:   .max_num_work_groups_y: 1024
+; CHECK-NEXT:   .max_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
 ; CHECK-NEXT:   .private_segment_fixed_size: 0

>From 88d3609adbef550345c4a4640422e356484a5937 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 16 Feb 2024 19:13:57 -0600
Subject: [PATCH 06/11] Code formatting.

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp | 14 ++++++++++----
 clang/lib/Sema/SemaDeclAttr.cpp      |  8 ++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 7bff652f1392b6..c1d8baa94c3edd 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -358,14 +358,20 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
   }
 
   if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getMaxNumWorkGroupsX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
-    uint32_t Y = Attr->getMaxNumWorkGroupsY()->EvaluateKnownConstInt(M.getContext()).getExtValue();
-    uint32_t Z = Attr->getMaxNumWorkGroupsZ()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
 
     llvm::SmallString<32> AttrVal;
     llvm::raw_svector_ostream OS(AttrVal);
     OS << X << ", " << Y << ", " << Z;
-     
+
     F->addFnAttr("amdgpu-max-num-work-groups", AttrVal.str());
   }
 }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index a38391bda1d1ab..ac26187eb800da 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8070,7 +8070,7 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }
 
 static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
-                                          const ParsedAttr &AL) {
+                                             const ParsedAttr &AL) {
   uint32_t NumWGX = 0;
   uint32_t NumWGY = 0;
   uint32_t NumWGZ = 0;
@@ -8084,9 +8084,9 @@ static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
   if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ !=0)
-    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(S.Context, AL, NumWGXExpr,
-                                                       NumWGYExpr, NumWGZExpr));
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0)
+    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
+        S.Context, AL, NumWGXExpr, NumWGYExpr, NumWGZExpr));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,

>From 5e5e8a478fe2c3d682456bca02e1d9a33c56bc9b Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 19 Feb 2024 13:04:36 -0600
Subject: [PATCH 07/11] Update AMDGPUUsage.rst and AttrDocs.td.

---
 clang/include/clang/Basic/AttrDocs.td |  2 ++
 llvm/docs/AMDGPUUsage.rst             | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index f9bea390736b4b..e79e94a5f14193 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2719,6 +2719,8 @@ definitions and is an optimization hint.
 
 ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
 Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+Each of the three numbers must be >=1. The attribute is ignored if any of the
+three numbers is 0.
 
 If specified, the AMDGPU target backend might be able to produce better machine
 code.
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6b2417143ca06c..34112c3ec85dda 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1312,6 +1312,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                              the frame. This is an internal detail of how LDS variables are lowered,
                                              language front ends should not set this attribute.
 
+     "amdgpu-max-num-work-groups"="x,y,z"    Specify the maximum number of work groups for the kernel dispatch in the
+                                             X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
+                                             CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
+                                             the three numbers are >= 1.
+
      ======================================= ==========================================================
 
 Calling Conventions
@@ -3740,6 +3745,11 @@ same *vendor-name*.
 
                                                                   If omitted, "normal" is
                                                                   assumed.
+     ".max_num_work_groups_{x,y,z}"      integer                  The max number of
+                                                                  launched work-groups
+                                                                  in the X, Y, and Z
+                                                                  dimensions. Each number
+                                                                  must be >=1.
      =================================== ============== ========= ================================
 
 ..

>From 34ae13e0bebe7e9a73da3826cdc2053cba7610c8 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 22 Feb 2024 20:25:21 -0600
Subject: [PATCH 08/11] Add (1) diagnostics for the attribute (2) test cases in
 clang.

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp     |  2 +-
 clang/lib/Sema/SemaDeclAttr.cpp          | 21 +++++-
 clang/test/CodeGenOpenCL/amdgpu-attrs.cl | 37 ++++++++++
 clang/test/SemaCUDA/amdgpu-attrs.cu      | 94 ++++++++++++++++++++++++
 4 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index c1d8baa94c3edd..5d6f91abc55cc9 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -370,7 +370,7 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
 
     llvm::SmallString<32> AttrVal;
     llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ", " << Y << ", " << Z;
+    OS << X << "," << Y << "," << Z;
 
     F->addFnAttr("amdgpu-max-num-work-groups", AttrVal.str());
   }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ac26187eb800da..8290f7a04a9569 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8071,20 +8071,33 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 
 static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
+  if (AL.getNumArgs() != 3) {
+    S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 3;
+    return;
+  }
   uint32_t NumWGX = 0;
   uint32_t NumWGY = 0;
   uint32_t NumWGZ = 0;
   Expr *NumWGXExpr = AL.getArgAsExpr(0);
   Expr *NumWGYExpr = AL.getArgAsExpr(1);
   Expr *NumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX, 0, true))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY, 1, true))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ, 2, true))
     return;
 
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0)
+  if (NumWGX == 0 || NumWGY == 0 || NumWGZ == 0) {
+    Expr* E = NumWGZExpr;
+    if (NumWGY == 0)
+      E = NumWGYExpr;
+    if (NumWGX == 0)
+      E = NumWGXExpr;
+    S.Diag(AL.getLoc(), diag::err_attribute_argument_is_zero)
+        << AL << E->getSourceRange();
+  }
+  else
     D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
         S.Context, AL, NumWGXExpr, NumWGYExpr, NumWGZExpr));
 }
diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
index b0dfc97b53b2c5..3baee97bf7f81b 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -139,6 +139,36 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
 // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
 }
 
+__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_8_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_4_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]]
+}
+
 void a_function() {
 // CHECK: define{{.*}} void @a_function() [[A_FUNCTION:#[0-9]+]]
 }
@@ -189,5 +219,12 @@ kernel void default_kernel() {
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
 
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_1]] = {{.*}} "amdgpu-max-num-work-groups"="1,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}} "amdgpu-max-num-work-groups"="32,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_8_1]] = {{.*}} "amdgpu-max-num-work-groups"="32,8,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_32]] = {{.*}} "amdgpu-max-num-work-groups"="1,1,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_8_32]] = {{.*}} "amdgpu-max-num-work-groups"="1,8,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_4_8_32]] = {{.*}} "amdgpu-max-num-work-groups"="4,8,32"
+
 // CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}}
 // CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"
diff --git a/clang/test/SemaCUDA/amdgpu-attrs.cu b/clang/test/SemaCUDA/amdgpu-attrs.cu
index 4811ef796c66b3..964f88d3ac635e 100644
--- a/clang/test/SemaCUDA/amdgpu-attrs.cu
+++ b/clang/test/SemaCUDA/amdgpu-attrs.cu
@@ -63,6 +63,16 @@ __global__ void flat_work_group_size_32_64_waves_per_eu_2_num_sgpr_32_num_vgpr_6
 __attribute__((amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
 __global__ void flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
 
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
+
+
 // expected-error at +2{{attribute 'reqd_work_group_size' can only be applied to an OpenCL kernel function}}
 __attribute__((reqd_work_group_size(32, 64, 64)))
 __global__ void reqd_work_group_size_32_64_64() {}
@@ -194,3 +204,87 @@ __global__ void non_cexpr_waves_per_eu_2() {}
 // expected-error at +1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
 __attribute__((amdgpu_waves_per_eu(2, ipow2(2))))
 __global__ void non_cexpr_waves_per_eu_2_4() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}}
+__attribute__((amdgpu_max_num_work_groups(32)))
+__global__ void max_num_work_groups_32() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}}
+__attribute__((amdgpu_max_num_work_groups(32, 1)))
+__global__ void max_num_work_groups_32_1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg0() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, "1", 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(-32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, -1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, -1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(0, 1, 1)))
+__global__ void max_num_work_groups_0_1_1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 0, 1)))
+__global__ void max_num_work_groups_32_0_1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 0)))
+__global__ void max_num_work_groups_32_1_0() {}
+
+
+int num_wg_x = 32;
+int num_wg_y = 1;
+int num_wg_z = 1;
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg0() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg1() {}
+
+// expected-error at +1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg2() {}
+
+const int c_num_wg_x = 32;
+__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_const_arg0() {}
+
+// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 1, 1)))
+__global__ void template_1_max_num_work_groups() {}
+template __global__ void template_1_max_num_work_groups<32>();
+
+// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 1)))
+__global__ void template_2_max_num_work_groups() {}
+template __global__ void template_2_max_num_work_groups<1>();
+
+// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 1, a)))
+__global__ void template_3_max_num_work_groups() {}
+template __global__ void template_3_max_num_work_groups<1>();
+
+

>From be217ca45f3788ec7664376dbcd20571d1270efa Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Thu, 22 Feb 2024 20:32:22 -0600
Subject: [PATCH 09/11] Fix formatting and AttrDocs.td.

---
 clang/include/clang/Basic/AttrDocs.td | 5 ++---
 clang/lib/Sema/SemaDeclAttr.cpp       | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e79e94a5f14193..df4d97d03620b6 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2717,10 +2717,9 @@ Clang supports the
 AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
 definitions and is an optimization hint.
 
-``<x>`` parameter specifies the maximum number of work groups in the x dimension.
+The ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
 Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
-Each of the three numbers must be >=1. The attribute is ignored if any of the
-three numbers is 0.
+Each of the three values must be greater than 0.
 
 If specified, the AMDGPU target backend might be able to produce better machine
 code.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 8290f7a04a9569..84579ae03c62e6 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8089,15 +8089,14 @@ static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
     return;
 
   if (NumWGX == 0 || NumWGY == 0 || NumWGZ == 0) {
-    Expr* E = NumWGZExpr;
+    Expr *E = NumWGZExpr;
     if (NumWGY == 0)
       E = NumWGYExpr;
     if (NumWGX == 0)
       E = NumWGXExpr;
     S.Diag(AL.getLoc(), diag::err_attribute_argument_is_zero)
         << AL << E->getSourceRange();
-  }
-  else
+  } else
     D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
         S.Context, AL, NumWGXExpr, NumWGYExpr, NumWGZExpr));
 }

>From 321593a7b7652339c65c111b8806ed828baeec24 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sat, 24 Feb 2024 19:13:08 -0600
Subject: [PATCH 10/11] Allow the attribute's elements to be template
 arguments.

---
 clang/include/clang/Sema/Sema.h               | 10 +++
 clang/lib/Sema/SemaDeclAttr.cpp               | 77 +++++++++++++------
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  | 29 +++++++
 clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu |  6 ++
 clang/test/SemaCUDA/amdgpu-attrs.cu           | 36 ++++++---
 5 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 1ac2a465a0d594..8bb261d8b05831 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11320,6 +11320,16 @@ class Sema final {
   void addAMDGPUWavesPerEUAttr(Decl *D, const AttributeCommonInfo &CI,
                                Expr *Min, Expr *Max);
 
+  /// Create an AMDGPUMaxNumWorkGroupsAttr attribute.
+  AMDGPUMaxNumWorkGroupsAttr *
+  CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI, Expr *XExpr,
+                                   Expr *YExpr, Expr *ZExpr);
+
+  /// addAMDGPUMaxNumWorkGroupsAttr - Adds an amdgpu_max_num_work_groups
+  /// attribute to a particular declaration.
+  void addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                     Expr *XExpr, Expr *YExpr, Expr *ZExpr);
+
   bool checkNSReturnsRetainedReturnType(SourceLocation loc, QualType type);
 
   //===--------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 84579ae03c62e6..e6630b5ca48e3e 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,36 +8069,65 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+// Returns true if error
+static bool
+checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr,
+                                     Expr *ZExpr,
+                                     const AMDGPUMaxNumWorkGroupsAttr &Attr) {
+  if (S.DiagnoseUnexpandedParameterPack(XExpr) ||
+      S.DiagnoseUnexpandedParameterPack(YExpr) ||
+      S.DiagnoseUnexpandedParameterPack(ZExpr))
+    return true;
+
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (XExpr->isValueDependent() || YExpr->isValueDependent() ||
+      ZExpr->isValueDependent())
+    return false;
+
+  uint32_t NumWG[3];
+  Expr *Exprs[3] = {XExpr, YExpr, ZExpr};
+  for (int i = 0; i < 3; i++) {
+    if (!checkUInt32Argument(S, Attr, Exprs[i], NumWG[i], i,
+                             /*StrictlyUnsigned=*/true))
+      return true;
+    if (NumWG[i] == 0) {
+      S.Diag(Attr.getLoc(), diag::err_attribute_argument_is_zero)
+          << &Attr << Exprs[i]->getSourceRange();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+AMDGPUMaxNumWorkGroupsAttr *
+Sema::CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI,
+                                       Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
+  AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
+
+  if (checkAMDGPUMaxNumWorkGroupsArguments(*this, XExpr, YExpr, ZExpr, TmpAttr))
+    return nullptr;
+
+  return ::new (Context)
+      AMDGPUMaxNumWorkGroupsAttr(Context, CI, XExpr, YExpr, ZExpr);
+}
+
+void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                         Expr *XExpr, Expr *YExpr,
+                                         Expr *ZExpr) {
+  if (auto *Attr = CreateAMDGPUMaxNumWorkGroupsAttr(CI, XExpr, YExpr, ZExpr))
+    D->addAttr(Attr);
+}
+
 static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
   if (AL.getNumArgs() != 3) {
     S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 3;
     return;
   }
-  uint32_t NumWGX = 0;
-  uint32_t NumWGY = 0;
-  uint32_t NumWGZ = 0;
-  Expr *NumWGXExpr = AL.getArgAsExpr(0);
-  Expr *NumWGYExpr = AL.getArgAsExpr(1);
-  Expr *NumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX, 0, true))
-    return;
-  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY, 1, true))
-    return;
-  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ, 2, true))
-    return;
-
-  if (NumWGX == 0 || NumWGY == 0 || NumWGZ == 0) {
-    Expr *E = NumWGZExpr;
-    if (NumWGY == 0)
-      E = NumWGYExpr;
-    if (NumWGX == 0)
-      E = NumWGXExpr;
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_is_zero)
-        << AL << E->getSourceRange();
-  } else
-    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
-        S.Context, AL, NumWGXExpr, NumWGYExpr, NumWGZExpr));
+  S.addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), AL.getArgAsExpr(1),
+                                  AL.getArgAsExpr(2));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index fcb27a880290b8..97f15c188a03f5 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -607,6 +607,29 @@ static void instantiateDependentAMDGPUWavesPerEUAttr(
   S.addAMDGPUWavesPerEUAttr(New, Attr, MinExpr, MaxExpr);
 }
 
+static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const AMDGPUMaxNumWorkGroupsAttr &Attr, Decl *New) {
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
+  if (ResultX.isInvalid())
+    return;
+  ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
+  if (ResultY.isInvalid())
+    return;
+  ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
+  if (ResultZ.isInvalid())
+    return;
+
+  Expr *XExpr = ResultX.getAs<Expr>();
+  Expr *YExpr = ResultY.getAs<Expr>();
+  Expr *ZExpr = ResultZ.getAs<Expr>();
+
+  S.addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
 // This doesn't take any template parameters, but we have a custom action that
 // needs to happen when the kernel itself is instantiated. We need to run the
 // ItaniumMangler to mark the names required to name this kernel.
@@ -792,6 +815,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
                                                *AMDGPUFlatWorkGroupSize, New);
     }
 
+    if (const auto *AMDGPUMaxNumWorkGroups =
+            dyn_cast<AMDGPUMaxNumWorkGroupsAttr>(TmplAttr)) {
+      instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+          *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
+    }
+
     if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
       instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
                                                 New);
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index a1642421af2c8c..26ed162fda43c7 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -40,12 +40,17 @@ __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
 __global__ void num_vgpr_64() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]
 }
+__attribute__((amdgpu_max_num_work_groups(32, 4, 2))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32_4_2() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z26max_num_work_groups_32_4_2v() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+}
 
 // Make sure this is silently accepted on other targets.
 // NAMD-NOT: "amdgpu-flat-work-group-size"
 // NAMD-NOT: "amdgpu-waves-per-eu"
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
+// NAMD-NOT: "amdgpu-max-num-work-groups"
 
 // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
@@ -53,5 +58,6 @@ __global__ void num_vgpr_64() {
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
 // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_4_2]] = {{.*}}"amdgpu-max-num-work-groups"="32,4,2"
 
 // NOUB-NOT: "uniform-work-group-size"="true"
diff --git a/clang/test/SemaCUDA/amdgpu-attrs.cu b/clang/test/SemaCUDA/amdgpu-attrs.cu
index 964f88d3ac635e..7655f24cbcd02f 100644
--- a/clang/test/SemaCUDA/amdgpu-attrs.cu
+++ b/clang/test/SemaCUDA/amdgpu-attrs.cu
@@ -269,22 +269,40 @@ const int c_num_wg_x = 32;
 __attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1)))
 __global__ void max_num_work_groups_32_1_1_const_arg0() {}
 
-// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
 template<unsigned a>
 __attribute__((amdgpu_max_num_work_groups(a, 1, 1)))
-__global__ void template_1_max_num_work_groups() {}
-template __global__ void template_1_max_num_work_groups<32>();
+__global__ void template_a_1_1_max_num_work_groups() {}
+template __global__ void template_a_1_1_max_num_work_groups<32>();
 
-// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
 template<unsigned a>
 __attribute__((amdgpu_max_num_work_groups(32, a, 1)))
-__global__ void template_2_max_num_work_groups() {}
-template __global__ void template_2_max_num_work_groups<1>();
+__global__ void template_32_a_1_max_num_work_groups() {}
+template __global__ void template_32_a_1_max_num_work_groups<1>();
 
-// expected-error at +2{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
 template<unsigned a>
 __attribute__((amdgpu_max_num_work_groups(32, 1, a)))
-__global__ void template_3_max_num_work_groups() {}
-template __global__ void template_3_max_num_work_groups<1>();
+__global__ void template_32_1_a_max_num_work_groups() {}
+template __global__ void template_32_1_a_max_num_work_groups<1>();
+
+// expected-error at +3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note at +4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(b, 1, 1)))
+__global__ void template_b_1_1_max_num_work_groups() {}
+template __global__ void template_b_1_1_max_num_work_groups<0>();
+
+// expected-error at +3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note at +4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, b, 1)))
+__global__ void template_32_b_1_max_num_work_groups() {}
+template __global__ void template_32_b_1_max_num_work_groups<0>();
+
+// expected-error at +3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note at +4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, 1, b)))
+__global__ void template_32_1_b_max_num_work_groups() {}
+template __global__ void template_32_1_b_max_num_work_groups<0>();
 
 

>From 6432b7386fed04920bcf7b3e3ca46172c3ae4928 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 26 Feb 2024 13:22:18 -0600
Subject: [PATCH 11/11] Updated release notes; added codegen testcases; minor
 code changes.

---
 clang/docs/ReleaseNotes.rst                    |  6 ++++++
 clang/lib/Sema/SemaDeclAttr.cpp                |  5 -----
 clang/lib/Sema/SemaTemplateInstantiateDecl.cpp |  6 +++---
 clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu  | 18 ++++++++++++++++++
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b7b11ab3a6b2e5..9e20ef7a4915ce 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -138,6 +138,12 @@ Removed Compiler Flags
 Attribute Changes in Clang
 --------------------------
 
+- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
+``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
+attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
+``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
+and each must be a positive integer.
+
 Improvements to Clang's diagnostics
 -----------------------------------
 - Clang now applies syntax highlighting to the code snippets it
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e6630b5ca48e3e..553d30a88156e6 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,7 +8069,6 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-// Returns true if error
 static bool
 checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr,
                                      Expr *ZExpr,
@@ -8122,10 +8121,6 @@ void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
 
 static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
-  if (AL.getNumArgs() != 3) {
-    S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 3;
-    return;
-  }
   S.addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), AL.getArgAsExpr(1),
                                   AL.getArgAsExpr(2));
 }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 97f15c188a03f5..3e820478516deb 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -614,13 +614,13 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
       S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
 
   ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
-  if (ResultX.isInvalid())
+  if (!ResultX.isUsable())
     return;
   ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
-  if (ResultY.isInvalid())
+  if (!ResultY.isUsable())
     return;
   ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
-  if (ResultZ.isInvalid())
+  if (!ResultZ.isUsable())
     return;
 
   Expr *XExpr = ResultX.getAs<Expr>();
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 26ed162fda43c7..a6904915cdb72c 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -45,6 +45,24 @@ __global__ void max_num_work_groups_32_4_2() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z26max_num_work_groups_32_4_2v() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
 }
 
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 4, 2)))
+__global__ void template_a_4_2_max_num_work_groups() {}
+template __global__ void template_a_4_2_max_num_work_groups<32>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z34template_a_4_2_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 2)))
+__global__ void template_32_a_2_max_num_work_groups() {}
+template __global__ void template_32_a_2_max_num_work_groups<4>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_a_2_max_num_work_groupsILj4EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 4, a)))
+__global__ void template_32_4_a_max_num_work_groups() {}
+template __global__ void template_32_4_a_max_num_work_groups<2>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
 // Make sure this is silently accepted on other targets.
 // NAMD-NOT: "amdgpu-flat-work-group-size"
 // NAMD-NOT: "amdgpu-waves-per-eu"



More information about the cfe-commits mailing list