[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)

Fri Dec 15 12:14:22 PST 2023

https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/75647

>From bb15eebae9645e5383f26066093c0734ea76442d Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 15 Dec 2023 13:53:54 -0600
Subject: [PATCH 1/2] [AMDGPU] Adding the amdgpu-num-work-groups function
 attribute

A new function attribute named amdgpu-num-work-groups is added.
This attribute allows programmers to let the compiler know the
number of workgroups to be launched and do optimizations based
on that information.
---
 clang/include/clang/Basic/Attr.td             |  7 ++
 clang/include/clang/Basic/AttrDocs.td         | 23 ++++++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  7 ++
 clang/lib/Sema/SemaDeclAttr.cpp               | 13 +++
 ...a-attribute-supported-attributes-list.test |  1 +
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  4 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  6 ++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  3 +
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  1 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  9 ++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 15 ++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  8 ++
 .../AMDGPU/attr-amdgpu-num-work-groups.ll     | 82 +++++++++++++++++++
 13 files changed, 179 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 5943583d92773a..605fcbbff027b9 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2011,6 +2011,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroups">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 77950ab6d877ea..0bf3ccf367284c 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2693,6 +2693,29 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<num>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<num>`` parameter specifies the number of work groups.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc8..11a0835f37f4a9 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,13 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t NumWG = Attr->getNumWorkGroups();
+
+    if (NumWG != 0)
+      F->addFnAttr("amdgpu-num-work-groups", llvm::utostr(NumWG));
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 5b29b05dee54b3..3737dd256aff02 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8051,6 +8051,16 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWG = 0;
+  Expr *NumWGExpr = AL.getArgAsExpr(0);
+  if (!checkUInt32Argument(S, AL, NumWGExpr, NumWG))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWG));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9058,6 +9068,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index bdfda430eea86c..d42bb52cc8bcfa 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b..b9ede45e174a7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,10 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWG = MFI.getNumWorkGroups();
+  if (NumWG != 0) {
+    Kern[".num_work_groups"] = Kern.getDocument()->getNode(NumWG);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f19c5766856408..d7f5c456706ecd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,9 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+unsigned AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  const unsigned Default = 0;
+  return AMDGPU::getUnsignedIntegerAttribute(F, "amdgpu-num-work-groups", Default);
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a1..b791399c38dff8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  unsigned getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 48c341917ddec7..2f483e18544a78 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7ff50c80081d30..fc244552f40da8 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  unsigned NumWorkGroups = 0;
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1094,6 +1097,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  unsigned getNumWorkGroups() const {
+    return NumWorkGroups;
+  }
+
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4edd7960bd8c40..82e3bca7ab73b1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1221,6 +1221,21 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3c9f330cbcded9..c54c1638fa97a1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -818,6 +818,14 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
new file mode 100644
index 00000000000000..315cd7dc0c0d91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_0:
+define amdgpu_kernel void @empty_num_work_groups_0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0"}
+
+; Exactly 1 work group.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1:
+define amdgpu_kernel void @empty_num_work_groups_1() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1"}
+
+; Exactly 5 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_5:
+define amdgpu_kernel void @empty_num_work_groups_5() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="5"}
+
+; Exactly 32 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_32:
+define amdgpu_kernel void @empty_num_work_groups_32() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="32"}
+
+; Exactly 50 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_50:
+define amdgpu_kernel void @empty_num_work_groups_50() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="50"}
+
+; Exactly 256 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_256:
+define amdgpu_kernel void @empty_num_work_groups_256() #5 {
+entry:
+  ret void
+}
+attributes #5 = {"amdgpu-num-work-groups"="256"}
+
+; Exactly 1024 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024() #6 {
+entry:
+  ret void
+}
+attributes #6 = {"amdgpu-num-work-groups"="1024"}
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1
+; CHECK-NEXT:   .num_work_groups: 1
+; CHECK:        .name:           empty_num_work_groups_5
+; CHECK-NEXT:   .num_work_groups: 5
+; CHECK:        .name:           empty_num_work_groups_32
+; CHECK-NEXT:   .num_work_groups: 32
+; CHECK:        .name:           empty_num_work_groups_50
+; CHECK-NEXT:   .num_work_groups: 50
+; CHECK:        .name:           empty_num_work_groups_256
+; CHECK-NEXT:   .num_work_groups: 256
+; CHECK:        .name:           empty_num_work_groups_1024
+; CHECK-NEXT:   .num_work_groups: 1024

>From b5bceb99a4ab70509cf17ea223494bd718b5f62f Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Fri, 15 Dec 2023 14:13:20 -0600
Subject: [PATCH 2/2] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp      | 4 ++--
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h  | 5 +----
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 3 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   | 3 ++-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d7f5c456706ecd..d69a78d3664bcd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1111,6 +1111,6 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 
 unsigned AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
   const unsigned Default = 0;
-  return AMDGPU::getUnsignedIntegerAttribute(F, "amdgpu-num-work-groups", Default);
+  return AMDGPU::getUnsignedIntegerAttribute(F, "amdgpu-num-work-groups",
+                                             Default);
 }
-
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index fc244552f40da8..1ab6d2bca902ae 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1099,10 +1099,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  unsigned getNumWorkGroups() const {
-    return NumWorkGroups;
-  }
-
+  unsigned getNumWorkGroups() const { return NumWorkGroups; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 82e3bca7ab73b1..1a763120e04bf1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1221,7 +1221,8 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
-unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default) {
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default) {
   Attribute A = F.getFnAttribute(Name);
   if (!A.isStringAttribute())
     return Default;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c54c1638fa97a1..f395384a2e0899 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -824,7 +824,8 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 ///
 /// \returns \p Default and emits error if requested value cannot be converted
 /// to integer.
-unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default);
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default);
 
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired