[llvm] 664a226 - AMDGPU: Propagate amdgpu-max-num-workgroups attribute (#113018)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 9 07:57:33 PST 2024
Author: Matt Arsenault
Date: 2024-12-09T09:57:27-06:00
New Revision: 664a226bf616a7dd6e1934cf45f84f1d99e8fed0
URL: https://github.com/llvm/llvm-project/commit/664a226bf616a7dd6e1934cf45f84f1d99e8fed0
DIFF: https://github.com/llvm/llvm-project/commit/664a226bf616a7dd6e1934cf45f84f1d99e8fed0.diff
LOG: AMDGPU: Propagate amdgpu-max-num-workgroups attribute (#113018)
I'm not sure what the interpretation of 0 is supposed to be,
AMDGPUUsage doesn't say.
Added:
llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 2ae34636005eac..b6e968ef82c962 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache {
return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
}
+ SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getMaxNumWorkGroups(F);
+ }
+
/// Get code object version.
unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
@@ -821,6 +826,145 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
"AAAMDFlatWorkGroupSize is only valid for function position");
}
+struct TupleDecIntegerRangeState : public AbstractState {
+ DecIntegerState<uint32_t> X, Y, Z;
+
+ bool isValidState() const override {
+ return X.isValidState() && Y.isValidState() && Z.isValidState();
+ }
+
+ bool isAtFixpoint() const override {
+ return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
+ }
+
+ ChangeStatus indicateOptimisticFixpoint() override {
+ return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
+ Z.indicateOptimisticFixpoint();
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
+ Z.indicatePessimisticFixpoint();
+ }
+
+ TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
+ X ^= Other.X;
+ Y ^= Other.Y;
+ Z ^= Other.Z;
+ return *this;
+ }
+
+ bool operator==(const TupleDecIntegerRangeState &Other) const {
+ return X == Other.X && Y == Other.Y && Z == Other.Z;
+ }
+
+ TupleDecIntegerRangeState &getAssumed() { return *this; }
+ const TupleDecIntegerRangeState &getAssumed() const { return *this; }
+};
+
+using AAAMDMaxNumWorkgroupsState =
+ StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
+
+/// Propagate amdgpu-max-num-workgroups attribute.
+struct AAAMDMaxNumWorkgroups
+ : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
+ using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
+
+ AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+ SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);
+
+ X.takeKnownMinimum(MaxNumWorkgroups[0]);
+ Y.takeKnownMinimum(MaxNumWorkgroups[1]);
+ Z.takeKnownMinimum(MaxNumWorkgroups[2]);
+
+ if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
+ << "->" << getAssociatedFunction()->getName() << '\n');
+
+ const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+ if (!CallerInfo || !CallerInfo->isValidState())
+ return false;
+
+ Change |=
+ clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this,
+ /*RequireAllCallSites=*/true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ ChangeStatus manifest(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ LLVMContext &Ctx = F->getContext();
+ SmallString<32> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
+
+ // TODO: Should annotate loads of the group size for this to do anything
+ // useful.
+ return A.manifestAttrs(
+ getIRPosition(),
+ {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
+ /* ForceReplace= */ true);
+ }
+
+ const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }
+
+ const std::string getAsStr(Attributor *) const override {
+ std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
+ raw_string_ostream OS(Buffer);
+ OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
+ << ']';
+ return OS.str();
+ }
+
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDMaxNumWorkgroups
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ void trackStatistics() const override {}
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDMaxNumWorkgroups::ID = 0;
+
+AAAMDMaxNumWorkgroups &
+AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
+ llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
+}
+
/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
@@ -1046,8 +1190,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
- &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
- &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
+ &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
+ &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
&AAInstanceInfo::ID});
@@ -1071,6 +1215,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
for (auto *F : Functions) {
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
+ A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
CallingConv::ID CC = F->getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CC)) {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
new file mode 100644
index 00000000000000..366432e0fc6cb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
+
+; External call to avoid inferring argument attributes. This makes the
+; final attribute groups easier to read
+declare void @dummy()
+
+define void @extern_callee() {
+; CHECK-LABEL: define void @extern_callee(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define internal void @callee_1_2_3() {
+; CHECK-LABEL: define internal void @callee_1_2_3(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_1_2_3() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @callee_1_2_3()
+; CHECK-NEXT: call void @extern_callee()
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @callee_1_2_3()
+ call void @extern_callee()
+ call void @dummy()
+ ret void
+}
+
+attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"}
+
+; -> 100,10,99
+define internal void @callee_merge_100_8_32__16_10_99() {
+; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT: ret void
+;
+ call void @callee_merge_100_8_32__16_10_99()
+ ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_100_8_32() #1 {
+; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32(
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT: ret void
+;
+ call void @callee_merge_100_8_32__16_10_99()
+ ret void
+}
+
+attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"}
+
+define amdgpu_kernel void @kernel_16_10_99() #2 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99()
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @callee_merge_100_8_32__16_10_99()
+ call void @dummy()
+ ret void
+}
+
+attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"}
+
+define internal void @merge_to_worst_case() {
+; CHECK-LABEL: define internal void @merge_to_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define internal void @callee_x_worst_case() {
+; CHECK-LABEL: define internal void @callee_x_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_x_maximum() #3 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: call void @merge_to_worst_case()
+; CHECK-NEXT: call void @callee_x_worst_case()
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @merge_to_worst_case()
+ call void @callee_x_worst_case()
+ call void @dummy()
+ ret void
+}
+
+attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"}
+
+define amdgpu_kernel void @kernel_y_maximum() #4 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT: call void @merge_to_worst_case()
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @merge_to_worst_case()
+ call void @dummy()
+ ret void
+}
+
+attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"}
+
+define amdgpu_kernel void @kernel_z_maximum() #5 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum(
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT: call void @merge_to_worst_case()
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @merge_to_worst_case()
+ call void @dummy()
+ ret void
+}
+
+attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"}
+
+; Make sure the attribute isn't lost from the callee.
+define internal void @annotated_callee_from_unannotated_kernel() #6 {
+; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel(
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"}
+
+define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() {
+; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @annotated_callee_from_unannotated_kernel()
+; CHECK-NEXT: ret void
+;
+ call void @annotated_callee_from_unannotated_kernel()
+ ret void
+}
+
+
+define internal void @annotated_callee_merge_caller() #7 {
+; CHECK-LABEL: define internal void @annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"}
+
+define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 {
+; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller(
+; CHECK-SAME: ) #[[ATTR10:[0-9]+]] {
+; CHECK-NEXT: call void @annotated_callee_merge_caller()
+; CHECK-NEXT: ret void
+;
+ call void @annotated_callee_merge_caller()
+ ret void
+}
+
+attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"}
+
+define internal void @called_by_explicit_worst_case() {
+; CHECK-LABEL: define internal void @called_by_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: ret void
+;
+ call void @dummy()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_explicit_worst_case() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case(
+; CHECK-SAME: ) #[[ATTR11:[0-9]+]] {
+; CHECK-NEXT: call void @called_by_explicit_worst_case()
+; CHECK-NEXT: ret void
+;
+ call void @called_by_explicit_worst_case()
+ ret void
+}
+
+attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+;.
More information about the llvm-commits
mailing list