[llvm] 664a226 - AMDGPU: Propagate amdgpu-max-num-workgroups attribute (#113018)

Mon Dec 9 07:57:33 PST 2024

Author: Matt Arsenault
Date: 2024-12-09T09:57:27-06:00
New Revision: 664a226bf616a7dd6e1934cf45f84f1d99e8fed0

URL: https://github.com/llvm/llvm-project/commit/664a226bf616a7dd6e1934cf45f84f1d99e8fed0
DIFF: https://github.com/llvm/llvm-project/commit/664a226bf616a7dd6e1934cf45f84f1d99e8fed0.diff

LOG: AMDGPU: Propagate amdgpu-max-num-workgroups attribute (#113018)

I'm not sure what the interpretation of 0 is supposed to be,
AMDGPUUsage doesn't say.

Added: 
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 2ae34636005eac..b6e968ef82c962 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache {
     return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
   }
 
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.getMaxNumWorkGroups(F);
+  }
+
   /// Get code object version.
   unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
 
@@ -821,6 +826,145 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
       "AAAMDFlatWorkGroupSize is only valid for function position");
 }
 
+struct TupleDecIntegerRangeState : public AbstractState {
+  DecIntegerState<uint32_t> X, Y, Z;
+
+  bool isValidState() const override {
+    return X.isValidState() && Y.isValidState() && Z.isValidState();
+  }
+
+  bool isAtFixpoint() const override {
+    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
+  }
+
+  ChangeStatus indicateOptimisticFixpoint() override {
+    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
+           Z.indicateOptimisticFixpoint();
+  }
+
+  ChangeStatus indicatePessimisticFixpoint() override {
+    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
+           Z.indicatePessimisticFixpoint();
+  }
+
+  TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
+    X ^= Other.X;
+    Y ^= Other.Y;
+    Z ^= Other.Z;
+    return *this;
+  }
+
+  bool operator==(const TupleDecIntegerRangeState &Other) const {
+    return X == Other.X && Y == Other.Y && Z == Other.Z;
+  }
+
+  TupleDecIntegerRangeState &getAssumed() { return *this; }
+  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
+};
+
+using AAAMDMaxNumWorkgroupsState =
+    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
+
+/// Propagate amdgpu-max-num-workgroups attribute.
+struct AAAMDMaxNumWorkgroups
+    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
+  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
+
+  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
+
+    X.takeKnownMinimum(MaxNumWorkgroups[0]);
+    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
+    Z.takeKnownMinimum(MaxNumWorkgroups[2]);
+
+    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+      indicatePessimisticFixpoint();
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
+                        << "->" << getAssociatedFunction()->getName() << '\n');
+
+      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+      if (!CallerInfo || !CallerInfo->isValidState())
+        return false;
+
+      Change |=
+          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    if (!A.checkForAllCallSites(CheckCallSite, *this,
+                                /*RequireAllCallSites=*/true,
+                                AllCallSitesKnown))
+      return indicatePessimisticFixpoint();
+
+    return Change;
+  }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
+                                                  Attributor &A);
+
+  ChangeStatus manifest(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    LLVMContext &Ctx = F->getContext();
+    SmallString<32> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
+
+    // TODO: Should annotate loads of the group size for this to do anything
+    // useful.
+    return A.manifestAttrs(
+        getIRPosition(),
+        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
+        /* ForceReplace= */ true);
+  }
+
+  const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }
+
+  const std::string getAsStr(Attributor *) const override {
+    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
+    raw_string_ostream OS(Buffer);
+    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
+       << ']';
+    return OS.str();
+  }
+
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDMaxNumWorkgroups
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  void trackStatistics() const override {}
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDMaxNumWorkgroups::ID = 0;
+
+AAAMDMaxNumWorkgroups &
+AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
+  llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
+}
+
 /// Propagate amdgpu-waves-per-eu attribute.
 struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
   AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
@@ -1046,8 +1190,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   DenseSet<const char *> Allowed(
       {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
        &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
-       &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
-       &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
+       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
+       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
        &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
        &AAInstanceInfo::ID});
 
@@ -1071,6 +1215,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   for (auto *F : Functions) {
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
+    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
new file mode 100644
index 00000000000000..366432e0fc6cb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
+
+; External call to avoid inferring argument attributes. This makes the
+; final attribute groups easier to read
+declare void @dummy()
+
+define void @extern_callee() {
+; CHECK-LABEL: define void @extern_callee(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define internal void @callee_1_2_3() {
+; CHECK-LABEL: define internal void @callee_1_2_3(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_2_3() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @callee_1_2_3()
+; CHECK-NEXT:    call void @extern_callee()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_1_2_3()
+  call void @extern_callee()
+  call void @dummy()
+  ret void
+}
+
+attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"}
+
+; -> 100,10,99
+define internal void @callee_merge_100_8_32__16_10_99() {
+; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32(
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  ret void
+}
+
+attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"}
+
+define amdgpu_kernel void @kernel_16_10_99() #2 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @callee_merge_100_8_32__16_10_99()
+  call void @dummy()
+  ret void
+}
+
+attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"}
+
+define internal void @merge_to_worst_case() {
+; CHECK-LABEL: define internal void @merge_to_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define internal void @callee_x_worst_case() {
+; CHECK-LABEL: define internal void @callee_x_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_x_maximum() #3 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @callee_x_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @callee_x_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"}
+
+define amdgpu_kernel void @kernel_y_maximum() #4 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"}
+
+define amdgpu_kernel void @kernel_z_maximum() #5 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum(
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_to_worst_case()
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_to_worst_case()
+  call void @dummy()
+  ret void
+}
+
+attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"}
+
+; Make sure the attribute isn't lost from the callee.
+define internal void @annotated_callee_from_unannotated_kernel() #6 {
+; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel(
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"}
+
+define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee()  {
+; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @annotated_callee_from_unannotated_kernel()
+; CHECK-NEXT:    ret void
+;
+  call void @annotated_callee_from_unannotated_kernel()
+  ret void
+}
+
+
+define internal void @annotated_callee_merge_caller() #7 {
+; CHECK-LABEL: define internal void @annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"}
+
+define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 {
+; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR10:[0-9]+]] {
+; CHECK-NEXT:    call void @annotated_callee_merge_caller()
+; CHECK-NEXT:    ret void
+;
+  call void @annotated_callee_merge_caller()
+  ret void
+}
+
+attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"}
+
+define internal void @called_by_explicit_worst_case() {
+; CHECK-LABEL: define internal void @called_by_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    ret void
+;
+  call void @dummy()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_explicit_worst_case() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR11:[0-9]+]] {
+; CHECK-NEXT:    call void @called_by_explicit_worst_case()
+; CHECK-NEXT:    ret void
+;
+  call void @called_by_explicit_worst_case()
+  ret void
+}
+
+attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+;.