[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)

Jun Wang via cfe-commits cfe-commits at lists.llvm.org
Mon Feb 19 11:05:35 PST 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79035

>From 5c088a59bd36df40bae9a3a712f3994feded359d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 12:43:27 -0600
Subject: [PATCH 1/7] [AMDGPU] Adding the amdgpu-num-work-groups function
 attribute

A new function attribute named amdgpu-num-work-groups is added.
This attribute, which consists of three integers, allows programmers
to let the compiler know the number of workgroups to be launched in
each of the three dimensions and do optimizations based on that
information.
---
 clang/include/clang/Basic/Attr.td             |  7 ++
 clang/include/clang/Basic/AttrDocs.td         | 24 +++++++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 13 ++++
 clang/lib/Sema/SemaDeclAttr.cpp               | 22 +++++++
 ...a-attribute-supported-attributes-list.test |  1 +
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  8 +++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  5 ++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  3 +
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  2 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 53 +++++++++++++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++++++
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 65 +++++++++++++++++++
 13 files changed, 232 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 58838b01b4fd7c..1b4718258d91e6 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e02a1201e2ad79..e8fd10587a8022 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,6 +2705,30 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc8..93321efd26462c 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getNumWorkGroupsX();
+    uint32_t Y = Attr->getNumWorkGroupsY();
+    uint32_t Z = Attr->getNumWorkGroupsZ();
+
+    if (X != 0 && Y != 0 && Z != 0) {
+      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
+                            llvm::utostr(Y) + std::string(", ") +
+                            llvm::utostr(Z);
+      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+    }
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 069571fcf78641..98d1726bb3e0b8 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWGX = 0;
+  uint32_t NumWGY = 0;
+  uint32_t NumWGZ = 0;
+  Expr *NumWGXExpr = AL.getArgAsExpr(0);
+  Expr *NumWGYExpr = AL.getArgAsExpr(1);
+  Expr *NumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
+                                                       NumWGY, NumWGZ));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9173,6 +9192,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index e476c15b35ded9..3d12656612eb06 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 186fa58524b9f8..4ee48c6fe79088 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWGX = MFI.getNumWorkGroupsX();
+  unsigned NumWGY = MFI.getNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index bcc7dedf322969..4cdf61cf904984 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+SmallVector<unsigned>
+AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a1..90c394b6e3b252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b94d143a75e5ed..fec8650f01d766 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
+  assert(NumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea74..18ceb282a85db2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+
+  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
+  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
+  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 106fdb19f27895..398d78341e41cd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -1253,6 +1254,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size) {
+  assert(Size > 2);
+  SmallVector<unsigned> Default(Size, 0);
+
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  SmallVector<unsigned> Vals(Size, 0);
+
+  LLVMContext &Ctx = F.getContext();
+
+  StringRef S = A.getValueAsString();
+  unsigned i = 0;
+  for (; !S.empty() && i < Size; i++) {
+    std::pair<StringRef, StringRef> Strs = S.split(',');
+    unsigned IntVal;
+    if (Strs.first.trim().getAsInteger(0, IntVal)) {
+      Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
+                    Name);
+      return Default;
+    }
+    Vals[i] = IntVal;
+    S = Strs.second;
+  }
+
+  if (!S.empty() || i < Size) {
+    Ctx.emitError("attribute " + Name +
+                  " has incorrect number of integers; expected " +
+                  llvm::utostr(Size));
+    return Default;
+  }
+  return Vals;
+}
+
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 11b0bc5c81711e..92a4ab71c2f055 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -814,6 +814,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
@@ -828,6 +837,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
                         std::pair<unsigned, unsigned> Default,
                         bool OnlyFirstRequired = false);
 
+/// \returns Generate a vector of integer values requested using \p F's \p Name
+/// attribute.
+///
+/// \returns true if exactly Size (>2) number of integers are found in the
+/// attribute.
+///
+/// \returns false if any error occurs.
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size);
+
 /// Represents the counter values to wait for in an s_waitcnt instruction.
 ///
 /// Large values (including the maximum possible integer) can be used to
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
new file mode 100644
index 00000000000000..6fc6de91d1d030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
+define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
+define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
+define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1_2_3
+; CHECK-NEXT:   .num_work_groups_x: 1
+; CHECK-NEXT:   .num_work_groups_y: 2
+; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .num_work_groups_x: 1024
+; CHECK-NEXT:   .num_work_groups_y: 1024
+; CHECK-NEXT:   .num_work_groups_z: 1024

>From c4e460b39c5a50870929a041a89026b1135f6e0a Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Sun, 4 Feb 2024 18:23:06 -0600
Subject: [PATCH 2/7] Support 2 attributes: one for min and one for max number
 of work groups.

---
 clang/include/clang/Basic/Attr.td             |  15 +-
 clang/include/clang/Basic/AttrDocs.td         |  30 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  57 +++-
 clang/lib/Sema/SemaDeclAttr.cpp               |  52 +++-
 ...a-attribute-supported-attributes-list.test |   3 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  40 ++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   8 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |   7 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   6 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  16 +-
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 292 ++++++++++++++++--
 11 files changed, 444 insertions(+), 82 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 1b4718258d91e6..4044b7e1748af7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,10 +2031,17 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
-  let Documentation = [AMDGPUNumWorkGroupsDocs];
+def AMDGPUMinNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMinNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e8fd10587a8022..65c5e66f963fad 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,14 +2705,38 @@ An error will be given if:
   }];
 }
 
-def AMDGPUNumWorkGroupsDocs : Documentation {
+def AMDGPUMinNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The number of work groups specifies the number of work groups when the kernel
+The min number of work groups specifies the min number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The max number of work groups specifies the max number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
 AMDGPU target. This attribute may be attached to a kernel function definition
 and is an optimization hint.
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 93321efd26462c..d9c1807aa39705 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,18 +357,55 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getNumWorkGroupsX();
-    uint32_t Y = Attr->getNumWorkGroupsY();
-    uint32_t Z = Attr->getNumWorkGroupsZ();
-
-    if (X != 0 && Y != 0 && Z != 0) {
-      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
-                            llvm::utostr(Y) + std::string(", ") +
-                            llvm::utostr(Z);
-      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+  uint32_t MinWGX = 0;
+  uint32_t MinWGY = 0;
+  uint32_t MinWGZ = 0;
+
+  uint32_t MaxWGX = 0;
+  uint32_t MaxWGY = 0;
+  uint32_t MaxWGZ = 0;
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
+    MinWGX = Attr->getMinNumWorkGroupsX();
+    MinWGY = Attr->getMinNumWorkGroupsY();
+    MinWGZ = Attr->getMinNumWorkGroupsZ();
+
+    if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
+      IsMinNumWGValid = true;
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    MaxWGX = Attr->getMaxNumWorkGroupsX();
+    MaxWGY = Attr->getMaxNumWorkGroupsY();
+    MaxWGZ = Attr->getMaxNumWorkGroupsZ();
+
+    if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
+      IsMaxNumWGValid = true;
+  }
+
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
     }
   }
+
+  if (IsMinNumWGValid) {
+    std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
+                          llvm::utostr(MinWGY) + std::string(", ") +
+                          llvm::utostr(MinWGZ);
+    F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
+  }
+
+  if (IsMaxNumWGValid) {
+    std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
+                          llvm::utostr(MaxWGY) + std::string(", ") +
+                          llvm::utostr(MaxWGZ);
+    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 98d1726bb3e0b8..964bde03e31d4b 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,23 +8069,42 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
-                                          const ParsedAttr &AL) {
-  uint32_t NumWGX = 0;
-  uint32_t NumWGY = 0;
-  uint32_t NumWGZ = 0;
-  Expr *NumWGXExpr = AL.getArgAsExpr(0);
-  Expr *NumWGYExpr = AL.getArgAsExpr(1);
-  Expr *NumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MinNumWGX = 0;
+  uint32_t MinNumWGY = 0;
+  uint32_t MinNumWGZ = 0;
+  Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
+      S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
+}
+
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MaxNumWGX = 0;
+  uint32_t MaxNumWGY = 0;
+  uint32_t MaxNumWGZ = 0;
+  Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+  if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+  if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
-                                                       NumWGY, NumWGZ));
+  D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
+      S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9211,11 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUNumWorkGroups:
-    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
+    handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 3d12656612eb06..82d8340b565ce9 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,9 +4,10 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 4ee48c6fe79088..aaf334adea1cf9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,13 +494,39 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-  unsigned NumWGX = MFI.getNumWorkGroupsX();
-  unsigned NumWGY = MFI.getNumWorkGroupsY();
-  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
-    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
-    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
-    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+
+  unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
+  unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
+  unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
+
+  unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+  if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
+    IsMinNumWGValid = true;
+  if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
+    IsMaxNumWGValid = true;
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
+        MaxNumWGZ < MinNumWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
+    }
+  }
+
+  if (IsMinNumWGValid) {
+    Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
+    Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
+    Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
+  }
+
+  if (IsMaxNumWGValid) {
+    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
+    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
+    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4cdf61cf904984..6c9f0900e96f60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,6 +1110,10 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
+}
+SmallVector<unsigned>
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90c394b6e3b252..23fd75d13f199c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,8 +288,11 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
-  /// Return the number of work groups for the function.
-  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+  /// Return the min number of work groups for the function.
+  SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
+
+  /// Return the max number of work groups for the function.
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index fec8650f01d766..152d8a1ee6ec08 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  NumWorkGroups = ST.getNumWorkGroups(F);
-  assert(NumWorkGroups.size() == 3);
+  MinNumWorkGroups = ST.getMinNumWorkGroups(F);
+  assert(MinNumWorkGroups.size() == 3);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 18ceb282a85db2..35f43a6d35b948 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,7 +427,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1077,11 +1078,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+  SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
-  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
-  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
-  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
+  unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
+  unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
+  unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
+
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 6fc6de91d1d030..eec7819526238d 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -8,58 +8,288 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
-define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_x0:
+define amdgpu_kernel void @empty_min_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+attributes #0 = {"amdgpu-min-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
+define amdgpu_kernel void @empty_max_num_work_groups_x0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-max-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_y0:
+define amdgpu_kernel void @empty_min_num_work_groups_y0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-min-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
-define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
+define amdgpu_kernel void @empty_max_num_work_groups_y0() #3 {
 entry:
   ret void
 }
-attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+attributes #3 = {"amdgpu-max-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
-define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_z0:
+define amdgpu_kernel void @empty_min_num_work_groups_z0() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-min-num-work-groups"="1,2,0"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
+define amdgpu_kernel void @empty_max_num_work_groups_z0() #5 {
+entry:
+  ret void
+}
+attributes #5 = {"amdgpu-max-num-work-groups"="1,2,0"}
+
+
+
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_min_num_work_groups_1_2_3() #20 {
+entry:
+  ret void
+}
+attributes #20 = {"amdgpu-min-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #21 {
+entry:
+  ret void
+}
+attributes #21 = {"amdgpu-max-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_min_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_min_num_work_groups_1024_1024_1024() #22 {
+entry:
+  ret void
+}
+attributes #22 = {"amdgpu-min-num-work-groups"="1024,1024,1024"}
+
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #23 {
+entry:
+  ret void
+}
+attributes #23 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_min:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_min() #30 {
+entry:
+  ret void
+}
+attributes #30 = {"amdgpu-min-num-work-groups"="0,2,3" "amdgpu-max-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_max:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_max() #31 {
+entry:
+  ret void
+}
+attributes #31 = {"amdgpu-min-num-work-groups"="1,2,3" "amdgpu-max-num-work-groups"="0,2,3"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_x:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_x() #40 {
+entry:
+  ret void
+}
+attributes #40 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="1,3,4"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_y:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_y() #41 {
 entry:
   ret void
 }
-attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+attributes #41 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,1,4"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_z:
+define amdgpu_kernel void @empty_min_max_num_work_groups_bad_z() #42 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+attributes #42 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,1"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_equal1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_equal1() #50 {
 entry:
   ret void
 }
-attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+attributes #50 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,4"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal1() #60 {
+entry:
+  ret void
+}
+attributes #60 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,30,40"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal2:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal2() #61 {
+entry:
+  ret void
+}
+attributes #61 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,3,40"}
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal3:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal3() #62 {
+entry:
+  ret void
+}
+attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,4"}
+
+
+; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater1:
+define amdgpu_kernel void @empty_min_max_num_work_groups_greater1() #62 {
+entry:
+  ret void
+}
+attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,40"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .name:           empty_no_attribute
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_x0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_y0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_z0
-; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_1_2_3
-; CHECK-NEXT:   .num_work_groups_x: 1
-; CHECK-NEXT:   .num_work_groups_y: 2
-; CHECK-NEXT:   .num_work_groups_z: 3
-; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
-; CHECK-NEXT:   .num_work_groups_x: 1024
-; CHECK-NEXT:   .num_work_groups_y: 1024
-; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_x0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_y0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_z0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1
+; CHECK-NEXT:   .min_num_work_groups_y: 2
+; CHECK-NEXT:   .min_num_work_groups_z: 3
+; CHECK:        .name:           empty_min_num_work_groups_1_2_3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1024
+; CHECK-NEXT:   .min_num_work_groups_y: 1024
+; CHECK-NEXT:   .min_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_min_num_work_groups_1024_1024_1024
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1024
+; CHECK-NEXT:   .max_num_work_groups_y: 1024
+; CHECK-NEXT:   .max_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_min
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .min_num_work_groups_x: 1
+; CHECK-NEXT:   .min_num_work_groups_y: 2
+; CHECK-NEXT:   .min_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_max
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_x
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_y
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_z
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 2
+; CHECK-NEXT:   .max_num_work_groups_y: 3
+; CHECK-NEXT:   .max_num_work_groups_z: 4
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_equal1
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 2
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal1
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 3
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal2
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 4
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal3
+
+; CHECK:        .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 20
+; CHECK-NEXT:   .max_num_work_groups_y: 30
+; CHECK-NEXT:   .max_num_work_groups_z: 40
+; CHECK-NEXT:   .min_num_work_groups_x: 2
+; CHECK-NEXT:   .min_num_work_groups_y: 3
+; CHECK-NEXT:   .min_num_work_groups_z: 4
+; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater1

>From 5485ab8007b733811f7f3d3e4ec1cfc518c7aae6 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 9 Feb 2024 15:57:07 -0600
Subject: [PATCH 3/7] Revert "Support 2 attributes: one for min and one for max
 number of work groups."

This reverts commit c4e460b39c5a50870929a041a89026b1135f6e0a.
---
 clang/include/clang/Basic/Attr.td             |  15 +-
 clang/include/clang/Basic/AttrDocs.td         |  30 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  57 +---
 clang/lib/Sema/SemaDeclAttr.cpp               |  52 +---
 ...a-attribute-supported-attributes-list.test |   3 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  40 +--
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   8 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |   7 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   6 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  16 +-
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 292 ++----------------
 11 files changed, 82 insertions(+), 444 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4044b7e1748af7..1b4718258d91e6 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,17 +2031,10 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUMinNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
-  let Documentation = [AMDGPUMinNumWorkGroupsDocs];
-  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
-}
-
-def AMDGPUMaxNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
-  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 65c5e66f963fad..e8fd10587a8022 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,38 +2705,14 @@ An error will be given if:
   }];
 }
 
-def AMDGPUMinNumWorkGroupsDocs : Documentation {
+def AMDGPUNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The min number of work groups specifies the min number of work groups when the kernel
+The number of work groups specifies the number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
-AMDGPU target. This attribute may be attached to a kernel function definition
-and is an optimization hint.
-
-``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
-Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
-
-If specified, the AMDGPU target backend might be able to produce better machine
-code.
-
-An error will be given if:
-  - Specified values violate subtarget specifications;
-  - Specified values are not compatible with values provided through other
-    attributes.
-  }];
-}
-
-def AMDGPUMaxNumWorkGroupsDocs : Documentation {
-  let Category = DocCatAMDGPUAttributes;
-  let Content = [{
-The max number of work groups specifies the max number of work groups when the kernel
-is dispatched.
-
-Clang supports the
-``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
 AMDGPU target. This attribute may be attached to a kernel function definition
 and is an optimization hint.
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index d9c1807aa39705..93321efd26462c 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,55 +357,18 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  uint32_t MinWGX = 0;
-  uint32_t MinWGY = 0;
-  uint32_t MinWGZ = 0;
-
-  uint32_t MaxWGX = 0;
-  uint32_t MaxWGY = 0;
-  uint32_t MaxWGZ = 0;
-
-  bool IsMinNumWGValid = false;
-  bool IsMaxNumWGValid = false;
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
-    MinWGX = Attr->getMinNumWorkGroupsX();
-    MinWGY = Attr->getMinNumWorkGroupsY();
-    MinWGZ = Attr->getMinNumWorkGroupsZ();
-
-    if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
-      IsMinNumWGValid = true;
-  }
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    MaxWGX = Attr->getMaxNumWorkGroupsX();
-    MaxWGY = Attr->getMaxNumWorkGroupsY();
-    MaxWGZ = Attr->getMaxNumWorkGroupsZ();
-
-    if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
-      IsMaxNumWGValid = true;
-  }
-
-  if (IsMinNumWGValid && IsMaxNumWGValid) {
-    if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
-      IsMinNumWGValid = false;
-      IsMaxNumWGValid = false;
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getNumWorkGroupsX();
+    uint32_t Y = Attr->getNumWorkGroupsY();
+    uint32_t Z = Attr->getNumWorkGroupsZ();
+
+    if (X != 0 && Y != 0 && Z != 0) {
+      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
+                            llvm::utostr(Y) + std::string(", ") +
+                            llvm::utostr(Z);
+      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
     }
   }
-
-  if (IsMinNumWGValid) {
-    std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
-                          llvm::utostr(MinWGY) + std::string(", ") +
-                          llvm::utostr(MinWGZ);
-    F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
-  }
-
-  if (IsMaxNumWGValid) {
-    std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
-                          llvm::utostr(MaxWGY) + std::string(", ") +
-                          llvm::utostr(MaxWGZ);
-    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
-  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 964bde03e31d4b..98d1726bb3e0b8 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,42 +8069,23 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
-                                             const ParsedAttr &AL) {
-  uint32_t MinNumWGX = 0;
-  uint32_t MinNumWGY = 0;
-  uint32_t MinNumWGZ = 0;
-  Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
-  Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
-  Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
-    return;
-  if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
-    return;
-  if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
-    return;
-
-  D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
-      S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
-}
-
-static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
-                                             const ParsedAttr &AL) {
-  uint32_t MaxNumWGX = 0;
-  uint32_t MaxNumWGY = 0;
-  uint32_t MaxNumWGZ = 0;
-  Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
-  Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
-  Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWGX = 0;
+  uint32_t NumWGY = 0;
+  uint32_t NumWGZ = 0;
+  Expr *NumWGXExpr = AL.getArgAsExpr(0);
+  Expr *NumWGYExpr = AL.getArgAsExpr(1);
+  Expr *NumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
     return;
-  if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
     return;
-  if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
-      S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
+                                                       NumWGY, NumWGZ));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9211,11 +9192,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
-    handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
-    break;
-  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
-    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 82d8340b565ce9..3d12656612eb06 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,10 +4,9 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index aaf334adea1cf9..4ee48c6fe79088 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,39 +494,13 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-
-  unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
-  unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
-  unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
-
-  unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
-  unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
-  unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
-
-  bool IsMinNumWGValid = false;
-  bool IsMaxNumWGValid = false;
-  if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
-    IsMinNumWGValid = true;
-  if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
-    IsMaxNumWGValid = true;
-  if (IsMinNumWGValid && IsMaxNumWGValid) {
-    if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
-        MaxNumWGZ < MinNumWGZ) {
-      IsMinNumWGValid = false;
-      IsMaxNumWGValid = false;
-    }
-  }
-
-  if (IsMinNumWGValid) {
-    Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
-    Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
-    Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
-  }
-
-  if (IsMaxNumWGValid) {
-    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
-    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
-    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
+  unsigned NumWGX = MFI.getNumWorkGroupsX();
+  unsigned NumWGY = MFI.getNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6c9f0900e96f60..4cdf61cf904984 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,10 +1110,6 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
-}
-SmallVector<unsigned>
-AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
+AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 23fd75d13f199c..90c394b6e3b252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,11 +288,8 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
-  /// Return the min number of work groups for the function.
-  SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
-
-  /// Return the max number of work groups for the function.
-  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 152d8a1ee6ec08..fec8650f01d766 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,10 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  MinNumWorkGroups = ST.getMinNumWorkGroups(F);
-  assert(MinNumWorkGroups.size() == 3);
-  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
-  assert(MaxNumWorkGroups.size() == 3);
+  NumWorkGroups = ST.getNumWorkGroups(F);
+  assert(NumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 35f43a6d35b948..18ceb282a85db2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,8 +427,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
-  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1078,16 +1077,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
-  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
+  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
 
-  unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
-  unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
-  unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
-
-  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
-  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
-  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
+  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
+  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index eec7819526238d..6fc6de91d1d030 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -8,288 +8,58 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_x0:
-define amdgpu_kernel void @empty_min_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
+define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-min-num-work-groups"="0,2,3"}
-
-; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
-define amdgpu_kernel void @empty_max_num_work_groups_x0() #1 {
-entry:
-  ret void
-}
-attributes #1 = {"amdgpu-max-num-work-groups"="0,2,3"}
-
-; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_y0:
-define amdgpu_kernel void @empty_min_num_work_groups_y0() #2 {
-entry:
-  ret void
-}
-attributes #2 = {"amdgpu-min-num-work-groups"="1,0,3"}
+attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
-define amdgpu_kernel void @empty_max_num_work_groups_y0() #3 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
+define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-max-num-work-groups"="1,0,3"}
+attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_z0:
-define amdgpu_kernel void @empty_min_num_work_groups_z0() #4 {
-entry:
-  ret void
-}
-attributes #4 = {"amdgpu-min-num-work-groups"="1,2,0"}
-
-; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
-define amdgpu_kernel void @empty_max_num_work_groups_z0() #5 {
-entry:
-  ret void
-}
-attributes #5 = {"amdgpu-max-num-work-groups"="1,2,0"}
-
-
-
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_min_num_work_groups_1_2_3() #20 {
-entry:
-  ret void
-}
-attributes #20 = {"amdgpu-min-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #21 {
-entry:
-  ret void
-}
-attributes #21 = {"amdgpu-max-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_min_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_min_num_work_groups_1024_1024_1024() #22 {
-entry:
-  ret void
-}
-attributes #22 = {"amdgpu-min-num-work-groups"="1024,1024,1024"}
-
-; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #23 {
-entry:
-  ret void
-}
-attributes #23 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_min:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_min() #30 {
-entry:
-  ret void
-}
-attributes #30 = {"amdgpu-min-num-work-groups"="0,2,3" "amdgpu-max-num-work-groups"="1,2,3"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_max:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_max() #31 {
-entry:
-  ret void
-}
-attributes #31 = {"amdgpu-min-num-work-groups"="1,2,3" "amdgpu-max-num-work-groups"="0,2,3"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_x:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_x() #40 {
-entry:
-  ret void
-}
-attributes #40 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="1,3,4"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_y:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_y() #41 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
+define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
 entry:
   ret void
 }
-attributes #41 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,1,4"}
+attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
 
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_bad_z:
-define amdgpu_kernel void @empty_min_max_num_work_groups_bad_z() #42 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
 entry:
   ret void
 }
-attributes #42 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,1"}
+attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
 
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_equal1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_equal1() #50 {
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
 entry:
   ret void
 }
-attributes #50 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,3,4"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal1() #60 {
-entry:
-  ret void
-}
-attributes #60 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="2,30,40"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal2:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal2() #61 {
-entry:
-  ret void
-}
-attributes #61 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,3,40"}
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater_or_equal3:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater_or_equal3() #62 {
-entry:
-  ret void
-}
-attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,4"}
-
-
-; CHECK-LABEL: {{^}}empty_min_max_num_work_groups_greater1:
-define amdgpu_kernel void @empty_min_max_num_work_groups_greater1() #62 {
-entry:
-  ret void
-}
-attributes #62 = {"amdgpu-min-num-work-groups"="2,3,4" "amdgpu-max-num-work-groups"="20,30,40"}
+attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_no_attribute
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_x0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_y0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_z0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1
-; CHECK-NEXT:   .min_num_work_groups_y: 2
-; CHECK-NEXT:   .min_num_work_groups_z: 3
-; CHECK:        .name:           empty_min_num_work_groups_1_2_3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1
-; CHECK-NEXT:   .max_num_work_groups_y: 2
-; CHECK-NEXT:   .max_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1024
-; CHECK-NEXT:   .min_num_work_groups_y: 1024
-; CHECK-NEXT:   .min_num_work_groups_z: 1024
-; CHECK-NEXT:   .name:           empty_min_num_work_groups_1024_1024_1024
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1024
-; CHECK-NEXT:   .max_num_work_groups_y: 1024
-; CHECK-NEXT:   .max_num_work_groups_z: 1024
-; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 1
-; CHECK-NEXT:   .max_num_work_groups_y: 2
-; CHECK-NEXT:   .max_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_min
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .min_num_work_groups_x: 1
-; CHECK-NEXT:   .min_num_work_groups_y: 2
-; CHECK-NEXT:   .min_num_work_groups_z: 3
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_max
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_x
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_y
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_bad_z
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 2
-; CHECK-NEXT:   .max_num_work_groups_y: 3
-; CHECK-NEXT:   .max_num_work_groups_z: 4
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_equal1
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 2
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal1
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 3
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal2
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 4
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater_or_equal3
-
-; CHECK:        .args:
-; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .max_num_work_groups_x: 20
-; CHECK-NEXT:   .max_num_work_groups_y: 30
-; CHECK-NEXT:   .max_num_work_groups_z: 40
-; CHECK-NEXT:   .min_num_work_groups_x: 2
-; CHECK-NEXT:   .min_num_work_groups_y: 3
-; CHECK-NEXT:   .min_num_work_groups_z: 4
-; CHECK-NEXT:   .name:           empty_min_max_num_work_groups_greater1
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1_2_3
+; CHECK-NEXT:   .num_work_groups_x: 1
+; CHECK-NEXT:   .num_work_groups_y: 2
+; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .num_work_groups_x: 1024
+; CHECK-NEXT:   .num_work_groups_y: 1024
+; CHECK-NEXT:   .num_work_groups_z: 1024

>From 09d4867a59099b2fa5ea204e001f49918887d651 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 9 Feb 2024 17:00:12 -0600
Subject: [PATCH 4/7] Minor change to the test file

---
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 6fc6de91d1d030..8b1ee07972c86a 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -47,19 +47,38 @@ attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
-; CHECK:        .name:           empty_no_attribute
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_x0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_x0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_y0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_y0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_z0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_z0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
-; CHECK:        .name:           empty_num_work_groups_1_2_3
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_1_2_3
 ; CHECK-NEXT:   .num_work_groups_x: 1
 ; CHECK-NEXT:   .num_work_groups_y: 2
 ; CHECK-NEXT:   .num_work_groups_z: 3
-; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_num_work_groups_1024_1024_1024
 ; CHECK-NEXT:   .num_work_groups_x: 1024
 ; CHECK-NEXT:   .num_work_groups_y: 1024
 ; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0

>From 8c2b9775ff89b2ea0dc619afd9397bbf7dac4ee9 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 16 Feb 2024 18:48:36 -0600
Subject: [PATCH 5/7] Update based on code review.

---
 clang/include/clang/Basic/Attr.td             |  8 +--
 clang/include/clang/Basic/AttrDocs.td         | 13 ++---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 21 ++++----
 clang/lib/Sema/SemaDeclAttr.cpp               | 11 ++--
 ...a-attribute-supported-attributes-list.test |  2 +-
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      | 12 ++---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  4 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  2 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  4 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 ++--
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 54 +++++++++----------
 11 files changed, 71 insertions(+), 70 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 1b4718258d91e6..059aa79e08f00b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,10 +2031,10 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
-  let Documentation = [AMDGPUNumWorkGroupsDocs];
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [ExprArgument<"MaxNumWorkGroupsX">, ExprArgument<"MaxNumWorkGroupsY">, ExprArgument<"MaxNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e8fd10587a8022..f9bea390736b4b 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,18 +2705,19 @@ An error will be given if:
   }];
 }
 
-def AMDGPUNumWorkGroupsDocs : Documentation {
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The number of work groups specifies the number of work groups when the kernel
+This attribute specifies the max number of work groups when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
-AMDGPU target. This attribute may be attached to a kernel function definition
-and is an optimization hint.
+``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` or
+``[[clang::amdgpu_max_num_work_groups(<x>, <y>, <z>)]]`` attribute for the
+AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
+definitions and is an optimization hint.
 
-``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+``<x>`` parameter specifies the maximum number of work groups in the x dimension.
 Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
 
 If specified, the AMDGPU target backend might be able to produce better machine
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 93321efd26462c..7bff652f1392b6 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,17 +357,16 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getNumWorkGroupsX();
-    uint32_t Y = Attr->getNumWorkGroupsY();
-    uint32_t Z = Attr->getNumWorkGroupsZ();
-
-    if (X != 0 && Y != 0 && Z != 0) {
-      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
-                            llvm::utostr(Y) + std::string(", ") +
-                            llvm::utostr(Z);
-      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
-    }
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getMaxNumWorkGroupsX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ", " << Y << ", " << Z;
+     
+    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal.str());
   }
 }
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 98d1726bb3e0b8..a38391bda1d1ab 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,7 +8069,7 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
                                           const ParsedAttr &AL) {
   uint32_t NumWGX = 0;
   uint32_t NumWGY = 0;
@@ -8084,8 +8084,9 @@ static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
   if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
-                                                       NumWGY, NumWGZ));
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ !=0)
+    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(S.Context, AL, NumWGXExpr,
+                                                       NumWGYExpr, NumWGZExpr));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9193,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUNumWorkGroups:
-    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 3d12656612eb06..217f50fa6adc21 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,9 +4,9 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 4ee48c6fe79088..ba933574e3a0cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,13 +494,13 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-  unsigned NumWGX = MFI.getNumWorkGroupsX();
-  unsigned NumWGY = MFI.getNumWorkGroupsY();
-  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
   if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
-    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
-    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
-    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4cdf61cf904984..cb7592ef83451b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,6 +1110,6 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90c394b6e3b252..e2d8b5d1ce9790 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -289,7 +289,7 @@ class AMDGPUSubtarget {
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
   /// Return the number of work groups for the function.
-  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index fec8650f01d766..d09b5f5d6f32ac 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,8 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  NumWorkGroups = ST.getNumWorkGroups(F);
-  assert(NumWorkGroups.size() == 3);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 18ceb282a85db2..7d0c1ba8448e6c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,7 +427,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1077,11 +1077,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
-  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
-  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
-  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
index 8b1ee07972c86a..1ec932fb227341 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
 ; Attribute not specified.
 ; CHECK-LABEL: {{^}}empty_no_attribute:
@@ -8,42 +8,42 @@ entry:
 }
 
 ; Ignore if number of work groups for x dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
-define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0:
+define amdgpu_kernel void @empty_max_num_work_groups_x0() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+attributes #0 = {"amdgpu-max-num-work-groups"="0,2,3"}
 
 ; Ignore if number of work groups for y dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
-define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0:
+define amdgpu_kernel void @empty_max_num_work_groups_y0() #1 {
 entry:
   ret void
 }
-attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+attributes #1 = {"amdgpu-max-num-work-groups"="1,0,3"}
 
 ; Ignore if number of work groups for z dimension is 0.
-; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
-define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0:
+define amdgpu_kernel void @empty_max_num_work_groups_z0() #2 {
 entry:
   ret void
 }
-attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+attributes #2 = {"amdgpu-max-num-work-groups"="1,2,0"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
-define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #3 {
 entry:
   ret void
 }
-attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+attributes #3 = {"amdgpu-max-num-work-groups"="1,2,3"}
 
-; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
-define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #4 {
 entry:
   ret void
 }
-attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+attributes #4 = {"amdgpu-max-num-work-groups"="1024,1024,1024"}
 
 
 ; CHECK: .amdgpu_metadata
@@ -54,31 +54,31 @@ attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_x0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_y0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_z0
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_1_2_3
-; CHECK-NEXT:   .num_work_groups_x: 1
-; CHECK-NEXT:   .num_work_groups_y: 2
-; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK-NEXT:   .max_num_work_groups_x: 1
+; CHECK-NEXT:   .max_num_work_groups_y: 2
+; CHECK-NEXT:   .max_num_work_groups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1_2_3
 ; CHECK-NEXT:   .private_segment_fixed_size: 0
 
 ; CHECK: - .args:
 ; CHECK:        .max_flat_workgroup_size: 1024
-; CHECK-NEXT:   .name:           empty_num_work_groups_1024_1024_1024
-; CHECK-NEXT:   .num_work_groups_x: 1024
-; CHECK-NEXT:   .num_work_groups_y: 1024
-; CHECK-NEXT:   .num_work_groups_z: 1024
+; CHECK-NEXT:   .max_num_work_groups_x: 1024
+; CHECK-NEXT:   .max_num_work_groups_y: 1024
+; CHECK-NEXT:   .max_num_work_groups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_work_groups_1024_1024_1024
 ; CHECK-NEXT:   .private_segment_fixed_size: 0

>From 88d3609adbef550345c4a4640422e356484a5937 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 16 Feb 2024 19:13:57 -0600
Subject: [PATCH 6/7] Code formatting.

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp | 14 ++++++++++----
 clang/lib/Sema/SemaDeclAttr.cpp      |  8 ++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 7bff652f1392b6..c1d8baa94c3edd 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -358,14 +358,20 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
   }
 
   if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getMaxNumWorkGroupsX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
-    uint32_t Y = Attr->getMaxNumWorkGroupsY()->EvaluateKnownConstInt(M.getContext()).getExtValue();
-    uint32_t Z = Attr->getMaxNumWorkGroupsZ()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
 
     llvm::SmallString<32> AttrVal;
     llvm::raw_svector_ostream OS(AttrVal);
     OS << X << ", " << Y << ", " << Z;
-     
+
     F->addFnAttr("amdgpu-max-num-work-groups", AttrVal.str());
   }
 }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index a38391bda1d1ab..ac26187eb800da 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8070,7 +8070,7 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }
 
 static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
-                                          const ParsedAttr &AL) {
+                                             const ParsedAttr &AL) {
   uint32_t NumWGX = 0;
   uint32_t NumWGY = 0;
   uint32_t NumWGZ = 0;
@@ -8084,9 +8084,9 @@ static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
   if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
     return;
 
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ !=0)
-    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(S.Context, AL, NumWGXExpr,
-                                                       NumWGYExpr, NumWGZExpr));
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0)
+    D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
+        S.Context, AL, NumWGXExpr, NumWGYExpr, NumWGZExpr));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,

>From 5e5e8a478fe2c3d682456bca02e1d9a33c56bc9b Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 19 Feb 2024 13:04:36 -0600
Subject: [PATCH 7/7] Update AMDGPUUsage.rst and AttrDocs.td.

---
 clang/include/clang/Basic/AttrDocs.td |  2 ++
 llvm/docs/AMDGPUUsage.rst             | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index f9bea390736b4b..e79e94a5f14193 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2719,6 +2719,8 @@ definitions and is an optimization hint.
 
 ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
 Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+Each of the three numbers must be >=1. The attribute is ignored if any of the
+three numbers is 0.
 
 If specified, the AMDGPU target backend might be able to produce better machine
 code.
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6b2417143ca06c..34112c3ec85dda 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1312,6 +1312,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                              the frame. This is an internal detail of how LDS variables are lowered,
                                              language front ends should not set this attribute.
 
+     "amdgpu-max-num-work-groups"="x,y,z"    Specify the maximum number of work groups for the kernel dispatch in the
+                                             X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
+                                             CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
+                                             the three numbers are >= 1.
+
      ======================================= ==========================================================
 
 Calling Conventions
@@ -3740,6 +3745,11 @@ same *vendor-name*.
 
                                                                   If omitted, "normal" is
                                                                   assumed.
+     ".max_num_work_groups_{x,y,z}"      integer                  The max number of
+                                                                  launched work-groups
+                                                                  in the X, Y, and Z
+                                                                  dimensions. Each number
+                                                                  must be >=1.
      =================================== ============== ========= ================================
 
 ..



More information about the cfe-commits mailing list