[llvm] [AMDGPU][Attributor] Infer inreg attribute in `AMDGPUAttributor` (PR #146720)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 09:50:52 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/146720
>From d55e8aeda7b6063818d63438490dadd644bd2688 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 2 Jul 2025 11:14:59 -0400
Subject: [PATCH 1/2] [AMDGPU][Attributor] Infer inreg attribute in
`AMDGPUAttributor`
This patch introduces `AAAMDGPUUniformArgument`, which can infer the `inreg`
function argument attribute. The idea is that, for a function argument, if the
corresponding call site arguments are always uniform, we can mark it as `inreg`
and thus pass it via an SGPR.
In addition, this AA is also able to propagate the `inreg` attribute if feasible.
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 116 +++++++++++++++++-
.../test/CodeGen/AMDGPU/aa-inreg-inference.ll | 74 +++++++++++
2 files changed, 189 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index dedee46a44237..9acdd21a4d81a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,6 +13,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -1297,6 +1298,114 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ StringRef getName() const override { return "AAAMDGPUUniform"; }
+
+ const std::string getAsStr(Attributor *A) const override {
+ return getAssumed() ? "uniform" : "divergent";
+ }
+
+ void trackStatistics() const override {}
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDGPUUniform
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDGPUUniform::ID = 0;
+
+/// This AA is to infer the inreg attribute for a function argument.
+struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
+ AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
+ : AAAMDGPUUniform(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Argument *Arg = getAssociatedArgument();
+ CallingConv::ID CC = Arg->getParent()->getCallingConv();
+ if (Arg->hasAttribute(Attribute::InReg)) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ if (AMDGPU::isEntryFunctionCC(CC)) {
+ // We only use isArgPassedInSGPR on kernel entry function argument, so
+ // even if we will use SGPR for non-uniform i1 argument passing, it will
+ // not affect this.
+ if (AMDGPU::isArgPassedInSGPR(Arg))
+ indicateOptimisticFixpoint();
+ else
+ indicatePessimisticFixpoint();
+ }
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ unsigned ArgNo = getAssociatedArgument()->getArgNo();
+ TargetMachine &TM =
+ static_cast<AMDGPUInformationCache &>(A.getInfoCache()).TM;
+
+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
+ CallBase *CB = ACS.getInstruction();
+ Value *V = CB->getArgOperand(ArgNo);
+ if (auto *Arg = dyn_cast<Argument>(V)) {
+ auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>(
+ IRPosition::argument(*Arg), this, DepClassTy::REQUIRED);
+ return AA && AA->isValidState();
+ }
+ TargetTransformInfo TTI = TM.getTargetTransformInfo(*CB->getFunction());
+ return TTI.isAlwaysUniform(V);
+ };
+
+ bool UsedAssumedInformation = true;
+ if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ if (!UsedAssumedInformation)
+ return indicateOptimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ Argument *Arg = getAssociatedArgument();
+ // If the argument already has inreg attribute, we will not do anything
+ // about it.
+ if (Arg->hasAttribute(Attribute::InReg))
+ return ChangeStatus::UNCHANGED;
+ if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
+ return ChangeStatus::UNCHANGED;
+ LLVMContext &Ctx = Arg->getContext();
+ return A.manifestAttrs(getIRPosition(),
+ {Attribute::get(Ctx, Attribute::InReg)});
+ }
+};
+
+AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_ARGUMENT:
+ return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
+ default:
+ llvm_unreachable("not a valid position for AAAMDGPUUniform");
+ }
+}
+
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1383,7 +1492,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAInstanceInfo::ID});
+ &AAIndirectCallInfo::ID, &AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1436,6 +1545,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr));
}
+
+ if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ for (auto &Arg : F->args())
+ A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
new file mode 100644
index 0000000000000..91dc5618b2989
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s
+
+@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
+@g4 = protected addrspace(1) externally_initialized global i32 0, align 4
+
+define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define internal void @callee_with_always_uniform_argument(
+; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_with_readfirstlane(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_with_readfirstlane(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]])
+; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) [[P0]], i32 [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %p0 = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %p)
+ call void @callee_with_always_uniform_argument(ptr addrspace(1) %p0, i32 %x)
+ ret void
+}
+
+define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define internal void @callee_without_always_uniform_argument(
+; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_with_divergent_callsite_argument(ptr addrspace(1) %p, i32 %x) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_with_divergent_callsite_argument(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[P]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: call void @callee_without_always_uniform_argument(ptr addrspace(1) [[GEP]], i32 [[D]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %p, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ call void @callee_without_always_uniform_argument(ptr addrspace(1) %gep, i32 %d)
+ ret void
+}
+
+declare ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1))
+declare noundef i32 @llvm.amdgcn.workitem.id.x()
>From 8533188feff8985dc8b900687c8f81675eedd69e Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 21 Jul 2025 09:59:55 -0400
Subject: [PATCH 2/2] fix comments
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 10 ++++---
.../test/CodeGen/AMDGPU/aa-inreg-inference.ll | 29 +++++++++++++------
.../AMDGPU/attributor-noalias-addrspace.ll | 4 +--
3 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9acdd21a4d81a..b327d06cce84b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1356,19 +1356,21 @@ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
ChangeStatus updateImpl(Attributor &A) override {
unsigned ArgNo = getAssociatedArgument()->getArgNo();
- TargetMachine &TM =
- static_cast<AMDGPUInformationCache &>(A.getInfoCache()).TM;
auto isUniform = [&](AbstractCallSite ACS) -> bool {
CallBase *CB = ACS.getInstruction();
Value *V = CB->getArgOperand(ArgNo);
+ if (isa<Constant>(V))
+ return true;
if (auto *Arg = dyn_cast<Argument>(V)) {
auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>(
IRPosition::argument(*Arg), this, DepClassTy::REQUIRED);
return AA && AA->isValidState();
}
- TargetTransformInfo TTI = TM.getTargetTransformInfo(*CB->getFunction());
- return TTI.isAlwaysUniform(V);
+ TargetTransformInfo *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+ *CB->getFunction());
+ return TTI->isAlwaysUniform(V);
};
bool UsedAssumedInformation = true;
diff --git a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
index 91dc5618b2989..22cfd4827e5da 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-inreg-inference.ll
@@ -4,21 +4,20 @@
@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
-@g4 = protected addrspace(1) externally_initialized global i32 0, align 4
define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
; CHECK-LABEL: define internal void @callee_with_always_uniform_argument(
; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
-; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
-; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4
; CHECK-NEXT: ret void
;
entry:
%x.val = load i32, ptr addrspace(1) %x, align 4
- store i32 %x.val, ptr addrspace(1) @g3, align 4
- store i32 %y, ptr addrspace(1) @g4, align 4
+ store i32 %x.val, ptr addrspace(1) @g2, align 4
+ store i32 %y, ptr addrspace(1) @g3, align 4
ret void
}
@@ -36,19 +35,31 @@ entry:
ret void
}
+define amdgpu_kernel void @kernel_with_constant(i32 %x) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_with_constant(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @callee_with_always_uniform_argument(ptr addrspace(1) @g1, i32 %x)
+ ret void
+}
+
define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
; CHECK-LABEL: define internal void @callee_without_always_uniform_argument(
; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
-; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
-; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g2, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g3, align 4
; CHECK-NEXT: ret void
;
entry:
%x.val = load i32, ptr addrspace(1) %x, align 4
- store i32 %x.val, ptr addrspace(1) @g3, align 4
- store i32 %y, ptr addrspace(1) @g4, align 4
+ store i32 %x.val, ptr addrspace(1) @g2, align 4
+ store i32 %y, ptr addrspace(1) @g3, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index d91b2117c7ad9..d4e213fecddf8 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -480,7 +480,7 @@ bb.2.end:
define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 {
; CHECK-LABEL: define internal void @callee_no_alias_addr_space_select(
-; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[PTR4:%.*]] = select i1 [[COND1]], ptr addrspacecast (ptr addrspace(1) @gptr to ptr), ptr addrspacecast (ptr addrspace(4) @gptr2 to ptr)
; CHECK-NEXT: [[PTR5:%.*]] = select i1 [[COND2]], ptr [[PTR4]], ptr addrspacecast (ptr addrspace(3) @gptr3 to ptr)
; CHECK-NEXT: store i32 [[VAL]], ptr [[PTR5]], align 4, !noalias.addrspace [[META1:![0-9]+]]
@@ -516,7 +516,7 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt
define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 {
; CHECK-LABEL: define internal void @callee_alias_addr_space_branch(
-; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 inreg [[COND1:%.*]], i1 inreg [[COND2:%.*]], i32 inreg [[VAL:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]]
; CHECK: [[BB_1_TRUE]]:
; CHECK-NEXT: br label %[[BB_1_END:.*]]
More information about the llvm-commits
mailing list