[llvm] [AMDGPU][Attributor] Add an option to turn on internalization (PR #108420)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 12 22:06:35 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

<details>
<summary>Changes</summary>

This PR adds an opt-in feature to internalize the module before running `AMDGPUAttributor` in LTO pipeline.

The closed-world assumption is not always true for AMDGPU, even at the LTO post-link stage, because AMDGPU does support relocatable linking. As a consequence, we can't assume we have a closed-world view at the IR level. In this case, if we have an indirect function call inside an externally visible function, whose argument determines the potential callee, we can't simply assume the potential callee is guaranteed to come from the module. Even if we force the closed-world assumption at our own discretion, this doesn't make the situation better, because `AAPotentialValues` can't work as expected if a function is externally visible. Under the closed-world assumption, the best we can do is treat all functions whose addresses are taken as potential callees.

As shown in the newly added test case, there is an indirect function call inside `@<!-- -->helper`, whose only argument is used as the function pointer. There are three functions whose addresses are taken in the module. Without the closed-world assumption, we can't optimize the indirect call. With the closed-world assumption, all three functions are taken as potential callees, and we can't narrow this down further because `AAPotentialValues` on `%fn` can't work as expected due to the external visibility of `@<!-- -->helper`. With internalization, an internalized version of `@<!-- -->helper` is created, and then everything pans out: there are only two potential callees.

Since this is an opt-in feature, it doesn't change any behavior by default, and whoever enables it does so at their own discretion.

---
Full diff: https://github.com/llvm/llvm-project/pull/108420.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+26-3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+12-2) 
- (added) llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll (+202) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 399aa9c633564c..19b6484d51ed3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -311,6 +311,7 @@ class AMDGPULowerKernelArgumentsPass
 
 struct AMDGPUAttributorOptions {
   bool IsClosedWorld = false;
+  bool EnableInternalization = false;
 };
 
 class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 687a7339da379d..da4adc3d03a4a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1031,9 +1031,32 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
 
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options) {
+  bool Changed = false;
+
+  DenseMap<Function *, Function *> InternalizedMap;
+  if (Options.EnableInternalization) {
+    auto IsCalled = [](Function &F) {
+      for (const User *U : F.users())
+        if (!isa<BlockAddress>(U))
+          return true;
+      return false;
+    };
+
+    SmallPtrSet<Function *, 16> InternalizeFns;
+    for (Function &F : M) {
+      if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
+          !IsCalled(F) || !Attributor::isInternalizable(F))
+        continue;
+      InternalizeFns.insert(&F);
+    }
+
+    Changed |=
+        Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
+  }
+
   SetVector<Function *> Functions;
   for (Function &F : M) {
-    if (!F.isIntrinsic())
+    if (!F.isIntrinsic() && !InternalizedMap.lookup(&F))
       Functions.insert(&F);
   }
 
@@ -1094,8 +1117,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     }
   }
 
-  ChangeStatus Change = A.run();
-  return Change == ChangeStatus::CHANGED;
+  Changed |= (A.run() == ChangeStatus::CHANGED);
+  return Changed;
 }
 
 class AMDGPUAttributorLegacy : public ModulePass {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 55d0de59bc49a9..7e258888c3bb7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -394,6 +394,11 @@ static cl::opt<bool>
                            cl::desc("Enable AMDGPUAttributorPass"),
                            cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    EnableLTOInternalization("amdgpu-lto-internalization",
+                             cl::desc("Enable LTO function internalization."),
+                             cl::Hidden, cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -674,6 +679,8 @@ parseAMDGPUAttributorPassOptions(StringRef Params) {
     std::tie(ParamName, Params) = Params.split(';');
     if (ParamName == "closed-world") {
       Result.IsClosedWorld = true;
+    } else if (ParamName == "internalization") {
+      Result.EnableInternalization = true;
     } else {
       return make_error<StringError>(
           formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
@@ -774,8 +781,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
           PM.addPass(AMDGPUSwLowerLDSPass(*this));
         if (EnableLowerModuleLDS)
           PM.addPass(AMDGPULowerModuleLDSPass(*this));
-        if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0)
-          PM.addPass(AMDGPUAttributorPass(*this));
+        if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) {
+          AMDGPUAttributorOptions Options;
+          Options.EnableInternalization = EnableLTOInternalization;
+          PM.addPass(AMDGPUAttributorPass(*this, Options));
+        }
       });
 
   PB.registerRegClassFilterParsingCallback(
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll
new file mode 100644
index 00000000000000..fcb3cd0a3012f4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-internalization.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck --check-prefixes=EXT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-attributor<closed-world>" %s | FileCheck --check-prefixes=CW %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-attributor<internalization>" %s | FileCheck --check-prefixes=INT %s
+
+ at G = global i32 0, align 4
+
+define void @callee0() {
+entry:
+  store i32 0, ptr @G, align 4
+  ret void
+}
+
+define void @callee1() {
+entry:
+  store i32 1, ptr @G, align 4
+  ret void
+}
+
+define void @callee2() {
+entry:
+  store i32 2, ptr @G, align 4
+  ret void
+}
+
+define void @helper(ptr %fn) {
+entry:
+  call void %fn()
+  ret void
+}
+
+define ptr @take_calle2_address() {
+entry:
+  ret ptr @callee2
+}
+
+define amdgpu_kernel void @foo(i1 %val) {
+entry:
+  %fn = select i1 %val, ptr @callee0, ptr @callee1
+  call void @helper(ptr %fn)
+  ret void
+}
+
+; EXT-LABEL: define void @callee0(
+; EXT-SAME: ) #[[ATTR0:[0-9]+]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    store i32 0, ptr @G, align 4
+; EXT-NEXT:    ret void
+;
+;
+; EXT-LABEL: define void @callee1(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    store i32 1, ptr @G, align 4
+; EXT-NEXT:    ret void
+;
+;
+; EXT-LABEL: define void @callee2(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    store i32 2, ptr @G, align 4
+; EXT-NEXT:    ret void
+;
+;
+; EXT-LABEL: define void @helper(
+; EXT-SAME: ptr [[FN:%.*]]) #[[ATTR1:[0-9]+]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    call void [[FN]]()
+; EXT-NEXT:    ret void
+;
+;
+; EXT-LABEL: define ptr @take_calle2_address(
+; EXT-SAME: ) #[[ATTR0]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    ret ptr @callee2
+;
+;
+; EXT-LABEL: define amdgpu_kernel void @foo(
+; EXT-SAME: i1 [[VAL:%.*]]) #[[ATTR2:[0-9]+]] {
+; EXT-NEXT:  [[ENTRY:.*:]]
+; EXT-NEXT:    [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; EXT-NEXT:    call void @helper(ptr [[FN]])
+; EXT-NEXT:    ret void
+;
+;
+; CW-LABEL: define void @callee0(
+; CW-SAME: ) #[[ATTR0:[0-9]+]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    store i32 0, ptr @G, align 4
+; CW-NEXT:    ret void
+;
+;
+; CW-LABEL: define void @callee1(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    store i32 1, ptr @G, align 4
+; CW-NEXT:    ret void
+;
+;
+; CW-LABEL: define void @callee2(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    store i32 2, ptr @G, align 4
+; CW-NEXT:    ret void
+;
+;
+; CW-LABEL: define void @helper(
+; CW-SAME: ptr [[FN:%.*]]) #[[ATTR1:[0-9]+]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    [[TMP0:%.*]] = icmp eq ptr [[FN]], @callee0
+; CW-NEXT:    br i1 [[TMP0]], label %[[BB1:.*]], label %[[BB2:.*]]
+; CW:       [[BB1]]:
+; CW-NEXT:    call void @callee0()
+; CW-NEXT:    br label %[[BB8:.*]]
+; CW:       [[BB2]]:
+; CW-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[FN]], @callee1
+; CW-NEXT:    br i1 [[TMP3]], label %[[BB4:.*]], label %[[BB5:.*]]
+; CW:       [[BB4]]:
+; CW-NEXT:    call void @callee1()
+; CW-NEXT:    br label %[[BB8]]
+; CW:       [[BB5]]:
+; CW-NEXT:    br i1 true, label %[[BB6:.*]], label %[[BB7:.*]]
+; CW:       [[BB6]]:
+; CW-NEXT:    call void @callee2()
+; CW-NEXT:    br label %[[BB8]]
+; CW:       [[BB7]]:
+; CW-NEXT:    unreachable
+; CW:       [[BB8]]:
+; CW-NEXT:    ret void
+;
+;
+; CW-LABEL: define ptr @take_calle2_address(
+; CW-SAME: ) #[[ATTR0]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    ret ptr @callee2
+;
+;
+; CW-LABEL: define amdgpu_kernel void @foo(
+; CW-SAME: i1 [[VAL:%.*]]) #[[ATTR2:[0-9]+]] {
+; CW-NEXT:  [[ENTRY:.*:]]
+; CW-NEXT:    [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; CW-NEXT:    call void @helper(ptr [[FN]])
+; CW-NEXT:    ret void
+;
+;
+; INT-LABEL: define void @callee0() {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    store i32 0, ptr @G, align 4
+; INT-NEXT:    ret void
+;
+;
+; INT-LABEL: define void @callee1() {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    store i32 1, ptr @G, align 4
+; INT-NEXT:    ret void
+;
+;
+; INT-LABEL: define void @callee2() {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    store i32 2, ptr @G, align 4
+; INT-NEXT:    ret void
+;
+;
+; INT-LABEL: define private void @helper.internalized(
+; INT-SAME: ptr [[FN:%.*]]) #[[ATTR0:[0-9]+]] {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    [[TMP0:%.*]] = icmp eq ptr [[FN]], @callee1
+; INT-NEXT:    br i1 [[TMP0]], label %[[BB1:.*]], label %[[BB2:.*]]
+; INT:       [[BB1]]:
+; INT-NEXT:    call void @callee1()
+; INT-NEXT:    br label %[[BB5:.*]]
+; INT:       [[BB2]]:
+; INT-NEXT:    br i1 true, label %[[BB3:.*]], label %[[BB4:.*]]
+; INT:       [[BB3]]:
+; INT-NEXT:    call void @callee0()
+; INT-NEXT:    br label %[[BB5]]
+; INT:       [[BB4]]:
+; INT-NEXT:    unreachable
+; INT:       [[BB5]]:
+; INT-NEXT:    ret void
+;
+;
+; INT-LABEL: define void @helper(
+; INT-SAME: ptr [[FN:%.*]]) {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    call void [[FN]]()
+; INT-NEXT:    ret void
+;
+;
+; INT-LABEL: define ptr @take_calle2_address(
+; INT-SAME: ) #[[ATTR1:[0-9]+]] {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    ret ptr @callee2
+;
+;
+; INT-LABEL: define amdgpu_kernel void @foo(
+; INT-SAME: i1 [[VAL:%.*]]) #[[ATTR0]] {
+; INT-NEXT:  [[ENTRY:.*:]]
+; INT-NEXT:    [[FN:%.*]] = select i1 [[VAL]], ptr @callee0, ptr @callee1
+; INT-NEXT:    call void @helper.internalized(ptr [[FN]])
+; INT-NEXT:    ret void
+;

``````````

</details>


https://github.com/llvm/llvm-project/pull/108420


More information about the llvm-commits mailing list