[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)

Mon Jan 22 10:50:58 PST 2024

https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/79035

A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information.

>From 8ed74e15d0798bf7d171c912e6268eba1d760b64 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Mon, 22 Jan 2024 12:43:27 -0600
Subject: [PATCH] [AMDGPU] Adding the amdgpu-num-work-groups function attribute

A new function attribute named amdgpu-num-work-groups is added.
This attribute, which consists of three integers, allows programmers
to let the compiler know the number of workgroups to be launched in
each of the three dimensions and do optimizations based on that
information.
---
 clang/include/clang/Basic/Attr.td             |  7 ++
 clang/include/clang/Basic/AttrDocs.td         | 24 +++++++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 13 ++++
 clang/lib/Sema/SemaDeclAttr.cpp               | 22 +++++++
 ...a-attribute-supported-attributes-list.test |  1 +
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  8 +++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  5 ++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  3 +
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  2 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 53 +++++++++++++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++++++
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 65 +++++++++++++++++++
 13 files changed, 232 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 78a9229aeaf081..5251858ac3bfd7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 9e8190614fbe8a..268d15eddab16f 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2702,6 +2702,30 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc8..93321efd26462c 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getNumWorkGroupsX();
+    uint32_t Y = Attr->getNumWorkGroupsY();
+    uint32_t Z = Attr->getNumWorkGroupsZ();
+
+    if (X != 0 && Y != 0 && Z != 0) {
+      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
+                            llvm::utostr(Y) + std::string(", ") +
+                            llvm::utostr(Z);
+      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+    }
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index a482919356e1bc..dc1c951031d58c 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8072,6 +8072,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWGX = 0;
+  uint32_t NumWGY = 0;
+  uint32_t NumWGZ = 0;
+  Expr *NumWGXExpr = AL.getArgAsExpr(0);
+  Expr *NumWGYExpr = AL.getArgAsExpr(1);
+  Expr *NumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
+                                                       NumWGY, NumWGZ));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9170,6 +9189,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index e476c15b35ded9..3d12656612eb06 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 74e9cd7d09654c..b1eb701d18db89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWGX = MFI.getNumWorkGroupsX();
+  unsigned NumWGY = MFI.getNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f19c5766856408..c02d7c6387c11d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+SmallVector<unsigned>
+AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a1..90c394b6e3b252 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e8142244b7db69..58f5fe415e84d2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
+  assert(NumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc63ae44c528db..29c75309bdebe6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1095,6 +1098,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+
+  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
+  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
+  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b4f7fc456f0bdd..9ef02b83f55b43 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -1261,6 +1262,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size) {
+  assert(Size > 2);
+  SmallVector<unsigned> Default(Size, 0);
+
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  SmallVector<unsigned> Vals(Size, 0);
+
+  LLVMContext &Ctx = F.getContext();
+
+  StringRef S = A.getValueAsString();
+  unsigned i = 0;
+  for (; !S.empty() && i < Size; i++) {
+    std::pair<StringRef, StringRef> Strs = S.split(',');
+    unsigned IntVal;
+    if (Strs.first.trim().getAsInteger(0, IntVal)) {
+      Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
+                    Name);
+      return Default;
+    }
+    Vals[i] = IntVal;
+    S = Strs.second;
+  }
+
+  if (!S.empty() || i < Size) {
+    Ctx.emitError("attribute " + Name +
+                  " has incorrect number of integers; expected " +
+                  llvm::utostr(Size));
+    return Default;
+  }
+  return Vals;
+}
+
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 351563e957f14a..ee2b68c13617a8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -818,6 +818,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
@@ -832,6 +841,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
                         std::pair<unsigned, unsigned> Default,
                         bool OnlyFirstRequired = false);
 
+/// \returns Generate a vector of integer values requested using \p F's \p Name
+/// attribute.
+///
+/// \returns true if exactly Size (>2) number of integers are found in the
+/// attribute.
+///
+/// \returns false if any error occurs.
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size);
+
 /// Represents the counter values to wait for in an s_waitcnt instruction.
 ///
 /// Large values (including the maximum possible integer) can be used to
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
new file mode 100644
index 00000000000000..6fc6de91d1d030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
+define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
+define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
+define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
+define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
+
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1_2_3
+; CHECK-NEXT:   .num_work_groups_x: 1
+; CHECK-NEXT:   .num_work_groups_y: 2
+; CHECK-NEXT:   .num_work_groups_z: 3
+; CHECK:        .name:           empty_num_work_groups_1024_1024_1024
+; CHECK-NEXT:   .num_work_groups_x: 1024
+; CHECK-NEXT:   .num_work_groups_y: 1024
+; CHECK-NEXT:   .num_work_groups_z: 1024