[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)

Fri Dec 15 12:01:23 PST 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

<details>
<summary>Changes</summary>

A new function attribute named amdgpu-num-work-groups is added. This attribute allows programmers to let the compiler know the number of workgroups to be launched and do optimizations based on that information.

---
Full diff: https://github.com/llvm/llvm-project/pull/75647.diff


13 Files Affected:

- (modified) clang/include/clang/Basic/Attr.td (+7) 
- (modified) clang/include/clang/Basic/AttrDocs.td (+23) 
- (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (+7) 
- (modified) clang/lib/Sema/SemaDeclAttr.cpp (+13) 
- (modified) clang/test/Misc/pragma-attribute-supported-attributes-list.test (+1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (+4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+6) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+1) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+9) 
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+15) 
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+8) 
- (added) llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll (+82) 


``````````diff

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 5943583d92773a..605fcbbff027b9 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2011,6 +2011,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroups">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 77950ab6d877ea..0bf3ccf367284c 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2693,6 +2693,29 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<num>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<num>`` parameter specifies the number of work groups.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc8..11a0835f37f4a9 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,13 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t NumWG = Attr->getNumWorkGroups();
+
+    if (NumWG != 0)
+      F->addFnAttr("amdgpu-num-work-groups", llvm::utostr(NumWG));
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 5b29b05dee54b3..3737dd256aff02 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8051,6 +8051,16 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWG = 0;
+  Expr *NumWGExpr = AL.getArgAsExpr(0);
+  if (!checkUInt32Argument(S, AL, NumWGExpr, NumWG))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWG));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9058,6 +9068,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index bdfda430eea86c..d42bb52cc8bcfa 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b..b9ede45e174a7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,10 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWG = MFI.getNumWorkGroups();
+  if (NumWG != 0) {
+    Kern[".num_work_groups"] = Kern.getDocument()->getNode(NumWG);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f19c5766856408..d7f5c456706ecd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,9 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+unsigned AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  const unsigned Default = 0;
+  return AMDGPU::getUnsignedIntegerAttribute(F, "amdgpu-num-work-groups", Default);
+}
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a1..b791399c38dff8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  unsigned getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 48c341917ddec7..2f483e18544a78 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7ff50c80081d30..fc244552f40da8 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  unsigned NumWorkGroups = 0;
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1094,6 +1097,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  unsigned getNumWorkGroups() const {
+    return NumWorkGroups;
+  }
+
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4edd7960bd8c40..82e3bca7ab73b1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1221,6 +1221,21 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3c9f330cbcded9..c54c1638fa97a1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -818,6 +818,14 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
new file mode 100644
index 00000000000000..315cd7dc0c0d91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_0:
+define amdgpu_kernel void @empty_num_work_groups_0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0"}
+
+; Exactly 1 work group.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1:
+define amdgpu_kernel void @empty_num_work_groups_1() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1"}
+
+; Exactly 5 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_5:
+define amdgpu_kernel void @empty_num_work_groups_5() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="5"}
+
+; Exactly 32 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_32:
+define amdgpu_kernel void @empty_num_work_groups_32() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="32"}
+
+; Exactly 50 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_50:
+define amdgpu_kernel void @empty_num_work_groups_50() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="50"}
+
+; Exactly 256 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_256:
+define amdgpu_kernel void @empty_num_work_groups_256() #5 {
+entry:
+  ret void
+}
+attributes #5 = {"amdgpu-num-work-groups"="256"}
+
+; Exactly 1024 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024() #6 {
+entry:
+  ret void
+}
+attributes #6 = {"amdgpu-num-work-groups"="1024"}
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1
+; CHECK-NEXT:   .num_work_groups: 1
+; CHECK:        .name:           empty_num_work_groups_5
+; CHECK-NEXT:   .num_work_groups: 5
+; CHECK:        .name:           empty_num_work_groups_32
+; CHECK-NEXT:   .num_work_groups: 32
+; CHECK:        .name:           empty_num_work_groups_50
+; CHECK-NEXT:   .num_work_groups: 50
+; CHECK:        .name:           empty_num_work_groups_256
+; CHECK-NEXT:   .num_work_groups: 256
+; CHECK:        .name:           empty_num_work_groups_1024
+; CHECK-NEXT:   .num_work_groups: 1024

``````````

</details>


https://github.com/llvm/llvm-project/pull/75647