[llvm] [AMDGPU][Attributor] Infer `inreg` attribute in `AMDGPUAttributor` (PR #101609)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 3 10:24:32 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/101609
>From 7b2599d2a6e5e4ce8b34d41dc30b8a42aec65bf7 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 1 Aug 2024 23:39:05 -0400
Subject: [PATCH] [WIP][AMDGPU][Attributor] Infer `inreg` attribute in
`AMDGPUAttributor`
---
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 103 +++++++++++++++++-
...butor-accesslist-offsetbins-out-of-sync.ll | 2 +-
llvm/test/CodeGen/AMDGPU/inreg-inference.ll | 97 +++++++++++++++++
.../AMDGPU/remove-no-kernel-id-attribute.ll | 4 +-
4 files changed, 202 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/inreg-inference.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 72049f0aa6b86e..860a936ffe06d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,6 +14,7 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -1014,6 +1015,100 @@ struct AAAMDGPUNoAGPR
const char AAAMDGPUNoAGPR::ID = 0;
+struct AAAMDGPUInreg
+ : public IRAttribute<Attribute::InReg,
+ StateWrapper<BooleanState, AbstractAttribute>,
+ AAAMDGPUInreg> {
+ AAAMDGPUInreg(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDGPUInreg &createForPosition(const IRPosition &IRP, Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAAMDGPUInreg"; }
+
+ const std::string getAsStr(Attributor *A) const override {
+ return getAssumed() ? "inreg" : "non-inreg";
+ }
+
+ void trackStatistics() const override {}
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is AAAMDGPUInreg
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDGPUInreg::ID = 0;
+
+namespace {
+
+struct AAAMDGPUInregArgument : public AAAMDGPUInreg {
+ AAAMDGPUInregArgument(const IRPosition &IRP, Attributor &A)
+ : AAAMDGPUInreg(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ assert(getAssociatedFunction()->getCallingConv() !=
+ CallingConv::AMDGPU_KERNEL);
+ if (getAssociatedArgument()->hasAttribute(Attribute::InReg))
+ indicateOptimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
+ CallBase *CB = ACS.getInstruction();
+ Value *V = CB->getArgOperandUse(ArgNo);
+ if (isa<Constant>(V))
+ return true;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *AU = A.getInfoCache()
+ .getAnalysisResultForFunction<UniformityInfoAnalysis>(
+ *I->getFunction());
+ return AU && AU->isUniform(I);
+ }
+ if (auto *Arg = dyn_cast<Argument>(V)) {
+ // We assume all kernel arguments are uniform.
+ if (Arg->getParent()->getCallingConv() == CallingConv::AMDGPU_KERNEL)
+ return true;
+ auto *AA =
+ A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(*Arg));
+ return AA && AA->isValidState();
+ }
+ return false;
+ };
+
+ bool UsedAssumedInformation = false;
+ if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ if (!UsedAssumedInformation)
+ return indicateOptimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+} // namespace
+
+AAAMDGPUInreg &AAAMDGPUInreg::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_ARGUMENT:
+ return *new (A.Allocator) AAAMDGPUInregArgument(IRP, A);
+ default:
+ llvm_unreachable("not a valid position for AAAMDGPUInreg");
+ }
+}
+
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
for (unsigned I = 0;
@@ -1046,7 +1141,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAInstanceInfo::ID});
+ &AAInstanceInfo::ID, &AAAMDGPUInreg::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1090,6 +1185,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
IRPosition::value(*SI->getPointerOperand()));
}
}
+
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) {
+ for (auto &Arg : F.args())
+ A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(Arg));
+ }
}
ChangeStatus Change = A.run();
@@ -1118,6 +1218,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<CycleInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
}
StringRef getPassName() const override { return "AMDGPU Attributor"; }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
index d58a62408427dc..4f46e08921a49b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
@@ -8,7 +8,7 @@
define internal fastcc void @foo(ptr %kg) {
; CHECK-LABEL: define internal fastcc void @foo(
-; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-inference.ll b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
new file mode 100644
index 00000000000000..531bc1f78073c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-inference.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor %s -o - | FileCheck %s
+
+ at g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+ at g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+ at g3 = protected addrspace(1) externally_initialized global i32 0, align 4
+ at g4 = protected addrspace(1) externally_initialized global i32 0, align 4
+
+;.
+; CHECK: @g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g2 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g3 = protected addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g4 = protected addrspace(1) externally_initialized global i32 0, align 4
+;.
+define internal fastcc void @callee_infer(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@callee_infer
+; CHECK-SAME: (ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @kernel_infer(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_infer
+; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 [[X]])
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 [[X]])
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1)
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2)
+; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[P]], i32 [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp sgt i32 %x, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 %x)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 %x)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1)
+ tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2)
+ tail call fastcc void @callee_infer(ptr addrspace(1) %p, i32 %x)
+ ret void
+}
+
+define internal fastcc void @callee_not_infer(ptr addrspace(1) %x, i32 %y) {
+; CHECK-LABEL: define {{[^@]+}}@callee_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
+; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
+; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.val = load i32, ptr addrspace(1) %x, align 4
+ store i32 %x.val, ptr addrspace(1) @g3, align 4
+ store i32 %y, ptr addrspace(1) @g4, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @kernel_not_infer(ptr addrspace(1) %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer
+; CHECK-SAME: (ptr addrspace(1) [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[Q]], i32 [[ID_X]]
+; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
+; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[Q]], i32 [[ID_X]])
+; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %q, i32 %id.x
+ %d = load i32, ptr addrspace(1) %gep
+ %cmp = icmp sgt i32 %d, 0
+ %p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
+ tail call fastcc void @callee_not_infer(ptr addrspace(1) %q, i32 %id.x)
+ tail call fastcc void @callee_not_infer(ptr addrspace(1) %p, i32 %id.x)
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 4420833029d466..474e6f87a3e14c 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
define internal i16 @mutual_recursion_0(i16 %arg) {
; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
define internal void @mutual_recursion_1(i16 %arg) {
; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]])
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list