[llvm] [AMDGPU][Attributor] Add an option to turn on internalization (PR #108420)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 12 22:05:51 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/108420
From ce0107f5cb9fda2a223b8015e8ef6f984e3e4201 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 12 Sep 2024 12:21:31 -0400
Subject: [PATCH] [AMDGPU][Attributor] Add an option to turn on internalization
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 29 ++-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 +-
.../AMDGPU/indirect-call-internalization.ll | 202 ++++++++++++++++++
4 files changed, 241 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 399aa9c633564c..19b6484d51ed3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -311,6 +311,7 @@ class AMDGPULowerKernelArgumentsPass
struct AMDGPUAttributorOptions {
bool IsClosedWorld = false;
+ bool EnableInternalization = false;
};
class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 687a7339da379d..da4adc3d03a4a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1031,9 +1031,32 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options) {
+ bool Changed = false;
+
+ DenseMap<Function *, Function *> InternalizedMap;
+ if (Options.EnableInternalization) {
+ auto IsCalled = [](Function &F) {
+ for (const User *U : F.users())
+ if (!isa<BlockAddress>(U))
+ return true;
+ return false;
+ };
+
+ SmallPtrSet<Function *, 16> InternalizeFns;
+ for (Function &F : M) {
+ if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
+ !IsCalled(F) || !Attributor::isInternalizable(F))
+ continue;
+ InternalizeFns.insert(&F);
+ }
+
+ Changed |=
+ Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
+ }
+
SetVector<Function *> Functions;
for (Function &F : M) {
- if (!F.isIntrinsic())
+ if (!F.isIntrinsic() && !InternalizedMap.lookup(&F))
Functions.insert(&F);
}
@@ -1094,8 +1117,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
}
}
- ChangeStatus Change = A.run();
- return Change == ChangeStatus::CHANGED;
+ Changed |= (A.run() == ChangeStatus::CHANGED);
+ return Changed;
}
class AMDGPUAttributorLegacy : public ModulePass {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 55d0de59bc49a9..7e258888c3bb7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -394,6 +394,11 @@ static cl::opt<bool>
cl::desc("Enable AMDGPUAttributorPass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableLTOInternalization("amdgpu-lto-internalization",
+ cl::desc("Enable LTO function internalization."),
+ cl::Hidden, cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -674,6 +679,8 @@ parseAMDGPUAttributorPassOptions(StringRef Params) {
std::tie(ParamName, Params) = Params.split(';');
if (ParamName == "closed-world") {
Result.IsClosedWorld = true;
+ } else if (ParamName == "internalization") {
+ Result.EnableInternalization = true;
} else {
return make_error<StringError>(
formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
@@ -774,8 +781,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUSwLowerLDSPass(*this));
if (EnableLowerModuleLDS)
PM.addPass(AMDGPULowerModuleLDSPass(*this));
- if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0)
- PM.addPass(AMDGPUAttributorPass(*this));
+ if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) {
+ AMDGPUAttributorOptions Options;
+ Options.EnableInternalization = EnableLTOInternalization;
+ PM.addPass(AMDGPUAttributorPass(*this, Options));
+ }
});
PB.registerRegClassFilterParsingCallback(
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll
new file mode 100644
index 00000000000000..fcb3cd0a3012f4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck --check-prefixes=EXT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-attributor<closed-world>" %s | FileCheck --check-prefixes=CW %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-attributor<internalization>" %s | FileCheck --check-prefixes=INT %s
+
+@G = global i32 0, align 4
+
+define void @callee0() {
+entry:
+ store i32 0, ptr @G, align 4
+ ret void
+}
+
+define void @callee1() {
+entry:
+ store i32 1, ptr @G, align 4
+ ret void
+}
+
+define void @callee2() {
+entry:
+ store i32 2, ptr @G, align 4
+ ret void
+}
+
+define void @helper(ptr %fn) {
+entry:
+ call void %fn()
+ ret void
+}
+
+define ptr @take_calle2_address() {
+entry:
+ ret ptr @callee2
+}
+
+define amdgpu_kernel void @foo(i1 %val) {
+entry:
+ %fn = select i1 %val, ptr @callee0, ptr @callee1
+ call void @helper(ptr %fn)
+ ret void
+}
+
+; EXT-LABEL: define void @callee0(
+; EXT-SAME: ) #[[ATTR0:[0-9]+]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: store i32 0, ptr @G, align 4
+; EXT-NEXT: ret void
+;
+;
+; EXT-LABEL: define void @callee1(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: store i32 1, ptr @G, align 4
+; EXT-NEXT: ret void
+;
+;
+; EXT-LABEL: define void @callee2(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: store i32 2, ptr @G, align 4
+; EXT-NEXT: ret void
+;
+;
+; EXT-LABEL: define void @helper(
+; EXT-SAME: ptr [[FN:%.*]]) #[[ATTR1:[0-9]+]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: call void [[FN]]()
+; EXT-NEXT: ret void
+;
+;
+; EXT-LABEL: define ptr @take_calle2_address(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: ret ptr @callee2
+;
+;
+; EXT-LABEL: define amdgpu_kernel void @foo(
+; EXT-SAME: i1 [[VAL:%.*]]) #[[ATTR2:[0-9]+]] {
+; EXT-NEXT: [[ENTRY:.*:]]
+; EXT-NEXT: [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; EXT-NEXT: call void @helper(ptr [[FN]])
+; EXT-NEXT: ret void
+;
+;
+; CW-LABEL: define void @callee0(
+; CW-SAME: ) #[[ATTR0:[0-9]+]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: store i32 0, ptr @G, align 4
+; CW-NEXT: ret void
+;
+;
+; CW-LABEL: define void @callee1(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: store i32 1, ptr @G, align 4
+; CW-NEXT: ret void
+;
+;
+; CW-LABEL: define void @callee2(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: store i32 2, ptr @G, align 4
+; CW-NEXT: ret void
+;
+;
+; CW-LABEL: define void @helper(
+; CW-SAME: ptr [[FN:%.*]]) #[[ATTR1:[0-9]+]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[FN]], @callee0
+; CW-NEXT: br i1 [[TMP0]], label %[[BB1:.*]], label %[[BB2:.*]]
+; CW: [[BB1]]:
+; CW-NEXT: call void @callee0()
+; CW-NEXT: br label %[[BB8:.*]]
+; CW: [[BB2]]:
+; CW-NEXT: [[TMP3:%.*]] = icmp eq ptr [[FN]], @callee1
+; CW-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[BB5:.*]]
+; CW: [[BB4]]:
+; CW-NEXT: call void @callee1()
+; CW-NEXT: br label %[[BB8]]
+; CW: [[BB5]]:
+; CW-NEXT: br i1 true, label %[[BB6:.*]], label %[[BB7:.*]]
+; CW: [[BB6]]:
+; CW-NEXT: call void @callee2()
+; CW-NEXT: br label %[[BB8]]
+; CW: [[BB7]]:
+; CW-NEXT: unreachable
+; CW: [[BB8]]:
+; CW-NEXT: ret void
+;
+;
+; CW-LABEL: define ptr @take_calle2_address(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: ret ptr @callee2
+;
+;
+; CW-LABEL: define amdgpu_kernel void @foo(
+; CW-SAME: i1 [[VAL:%.*]]) #[[ATTR2:[0-9]+]] {
+; CW-NEXT: [[ENTRY:.*:]]
+; CW-NEXT: [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; CW-NEXT: call void @helper(ptr [[FN]])
+; CW-NEXT: ret void
+;
+;
+; INT-LABEL: define void @callee0() {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: store i32 0, ptr @G, align 4
+; INT-NEXT: ret void
+;
+;
+; INT-LABEL: define void @callee1() {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: store i32 1, ptr @G, align 4
+; INT-NEXT: ret void
+;
+;
+; INT-LABEL: define void @callee2() {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: store i32 2, ptr @G, align 4
+; INT-NEXT: ret void
+;
+;
+; INT-LABEL: define private void @helper.internalized(
+; INT-SAME: ptr [[FN:%.*]]) #[[ATTR0:[0-9]+]] {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: [[TMP0:%.*]] = icmp eq ptr [[FN]], @callee1
+; INT-NEXT: br i1 [[TMP0]], label %[[BB1:.*]], label %[[BB2:.*]]
+; INT: [[BB1]]:
+; INT-NEXT: call void @callee1()
+; INT-NEXT: br label %[[BB5:.*]]
+; INT: [[BB2]]:
+; INT-NEXT: br i1 true, label %[[BB3:.*]], label %[[BB4:.*]]
+; INT: [[BB3]]:
+; INT-NEXT: call void @callee0()
+; INT-NEXT: br label %[[BB5]]
+; INT: [[BB4]]:
+; INT-NEXT: unreachable
+; INT: [[BB5]]:
+; INT-NEXT: ret void
+;
+;
+; INT-LABEL: define void @helper(
+; INT-SAME: ptr [[FN:%.*]]) {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: call void [[FN]]()
+; INT-NEXT: ret void
+;
+;
+; INT-LABEL: define ptr @take_calle2_address(
+; INT-SAME: ) #[[ATTR1:[0-9]+]] {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: ret ptr @callee2
+;
+;
+; INT-LABEL: define amdgpu_kernel void @foo(
+; INT-SAME: i1 [[VAL:%.*]]) #[[ATTR0]] {
+; INT-NEXT: [[ENTRY:.*:]]
+; INT-NEXT: [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; INT-NEXT: call void @helper.internalized(ptr [[FN]])
+; INT-NEXT: ret void
+;
More information about the llvm-commits
mailing list