[llvm] [AMDGPU][SplitModule] Handle !callees metadata (PR #108802)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 16 06:31:27 PDT 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/108802

>From 366853401b4411a4ed4621f81c8a3282b22a4000 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 16 Sep 2024 09:41:49 +0200
Subject: [PATCH 1/3] [AMDGPU][SplitModule] Cleanup CallsExternal Handling

- Don't treat inline ASM as indirect calls
- Remove alias testing, which was broken (only working by pure luck right now) and isn't needed anyway. GlobalOpt should take care of them for us.
---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  | 91 ++++++++++++++-----
 .../AMDGPU/indirect-call-inline-asm.ll        | 41 +++++++++
 .../AMDGPU/kernels-alias-dependencies.ll      | 41 ---------
 .../AMDGPU/kernels-dependency-indirect.ll     | 12 ---
 4 files changed, 110 insertions(+), 75 deletions(-)
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll
 delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index e97a7f4e075f7f..e76fc937793669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
@@ -103,6 +104,15 @@ static cl::opt<bool> NoExternalizeGlobals(
     cl::desc("disables externalization of global variable with local linkage; "
              "may cause globals to be duplicated which increases binary size"));
 
+static cl::opt<bool> NoExternalizeOnAddrTaken(
+    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
+    cl::desc(
+        "disables externalization of functions whose addresses are taken"));
+
+static cl::opt<bool> InlineAsmIsIndirectCall(
+    "amdgpu-module-splitting-inline-asm-is-indirect-call", cl::Hidden,
+    cl::desc("consider inline assembly as an indirect call"));
+
 static cl::opt<std::string>
     ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                        cl::Hidden,
@@ -482,6 +492,9 @@ void SplitGraph::buildGraph(CallGraph &CG) {
       dbgs()
       << "[build graph] constructing graph representation of the input\n");
 
+  // FIXME(?): Is the callgraph really worth using if we have to iterate the
+  // function again whenever it fails to give us enough information?
+
   // We build the graph by just iterating all functions in the module and
   // working on their direct callees. At the end, all nodes should be linked
   // together as expected.
@@ -492,29 +505,52 @@ void SplitGraph::buildGraph(CallGraph &CG) {
       continue;
 
     // Look at direct callees and create the necessary edges in the graph.
-    bool HasIndirectCall = false;
-    Node &N = getNode(Cache, Fn);
+    SetVector<const Function *> DirectCallees;
+    bool CallsExternal = false;
     for (auto &CGEntry : *CG[&Fn]) {
       auto *CGNode = CGEntry.second;
-      auto *Callee = CGNode->getFunction();
-      if (!Callee) {
-        // TODO: Don't consider inline assembly as indirect calls.
-        if (CGNode == CG.getCallsExternalNode())
-          HasIndirectCall = true;
-        continue;
-      }
-
-      if (!Callee->isDeclaration())
-        createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
+      if (auto *Callee = CGNode->getFunction()) {
+        if (!Callee->isDeclaration())
+          DirectCallees.insert(Callee);
+      } else if (CGNode == CG.getCallsExternalNode())
+        CallsExternal = true;
     }
 
     // Keep track of this function if it contains an indirect call and/or if it
     // can be indirectly called.
-    if (HasIndirectCall) {
-      LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n");
-      FnsWithIndirectCalls.push_back(&Fn);
+    if (CallsExternal) {
+      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for " << Fn.getName()
+                        << " - analyzing function\n");
+
+      bool HasIndirectCall = false;
+      for (const auto &Inst : instructions(Fn)) {
+        // look at all calls without a direct callee.
+        if (const auto *CB = dyn_cast<CallBase>(&Inst);
+            CB && !CB->getCalledFunction()) {
+          // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
+          // true.
+          if (CB->isInlineAsm()) {
+            if (InlineAsmIsIndirectCall)
+              HasIndirectCall = true;
+            LLVM_DEBUG(dbgs() << "    found inline assembly\n");
+            continue;
+          }
+
+          // everything else is handled conservatively.
+          HasIndirectCall = true;
+        }
+      }
+
+      if (HasIndirectCall) {
+        LLVM_DEBUG(dbgs() << "    indirect call found\n");
+        FnsWithIndirectCalls.push_back(&Fn);
+      }
     }
 
+    Node &N = getNode(Cache, Fn);
+    for (const auto *Callee : DirectCallees)
+      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
+
     if (canBeIndirectlyCalled(Fn))
       IndirectlyCallableFns.push_back(&Fn);
   }
@@ -1326,13 +1362,23 @@ static void splitAMDGPUModule(
   //
   // Additionally, it guides partitioning to not duplicate this function if it's
   // called directly at some point.
-  for (auto &Fn : M) {
-    if (Fn.hasAddressTaken()) {
-      if (Fn.hasLocalLinkage()) {
-        LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
-                          << " because its address is taken\n");
+  //
+  // TODO: Could we be smarter about this ? This makes all functions whose
+  // addresses are taken non-copyable. We should probably model this type of
+  // constraint in the graph and use it to guide splitting, instead of
+  // externalizing like this. Maybe non-copyable should really mean "keep one
+  // visible copy, then internalize all other copies" for some functions?
+  if (!NoExternalizeOnAddrTaken) {
+    for (auto &Fn : M) {
+      // TODO: Should aliases count? Probably not but they're so rare I'm not
+      // sure it's worth fixing.
+      if (Fn.hasAddressTaken()) {
+        if (Fn.hasLocalLinkage()) {
+          LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
+                            << " because its address is taken\n");
+        }
+        externalize(Fn);
       }
-      externalize(Fn);
     }
   }
 
@@ -1368,7 +1414,8 @@ static void splitAMDGPUModule(
     dbgs() << "[graph] nodes:\n";
     for (const SplitGraph::Node *N : SG.nodes()) {
       dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
-             << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
+             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
+             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
     }
   });
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll
new file mode 100644
index 00000000000000..5c7ddaeecd7044
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll
@@ -0,0 +1,41 @@
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+
+; RUN: llvm-split -o %t_as_indirect %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken -amdgpu-module-splitting-inline-asm-is-indirect-call
+; RUN: llvm-dis -o - %t_as_indirect0 | FileCheck --check-prefix=CHECK-INDIRECT0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_as_indirect1 | FileCheck --check-prefix=CHECK-INDIRECT1 --implicit-check-not=define %s
+
+; CHECK0: define internal void @HelperB
+; CHECK0: define amdgpu_kernel void @B
+
+; CHECK1: define internal void @HelperA()
+; CHECK1: define amdgpu_kernel void @A()
+
+; CHECK-INDIRECT0: define internal void @HelperB
+; CHECK-INDIRECT0: define amdgpu_kernel void @B
+
+; CHECK-INDIRECT1: define internal void @HelperA()
+; CHECK-INDIRECT1: define internal void @HelperB()
+; CHECK-INDIRECT1: define amdgpu_kernel void @A()
+
+ at addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB]
+
+define internal void @HelperA() {
+  ret void
+}
+
+define internal void @HelperB() {
+  ret void
+}
+
+define amdgpu_kernel void @A() {
+  call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"()
+  call void @HelperA()
+  ret void
+}
+
+define amdgpu_kernel void @B(ptr %out) {
+  call void @HelperB()
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
deleted file mode 100644
index d7e84abd5f968d..00000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-
-; 3 kernels:
-;   - A calls nothing
-;   - B calls @PerryThePlatypus
-;   - C calls @Perry, an alias of @PerryThePlatypus
-;
-; We should see through the alias and put B/C in the same
-; partition.
-;
-; Additionally, @PerryThePlatypus gets externalized as
-; the alias counts as taking its address.
-
-; CHECK0: define amdgpu_kernel void @A
-
-; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus
-; CHECK1: define hidden void @PerryThePlatypus()
-; CHECK1: define amdgpu_kernel void @B
-; CHECK1: define amdgpu_kernel void @C
-
- at Perry = internal alias ptr(), ptr @PerryThePlatypus
-
-define internal void @PerryThePlatypus() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @PerryThePlatypus()
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @Perry()
-  ret void
-}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
index 5be945bda48bf4..c2acb06d3e72e5 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
@@ -3,18 +3,6 @@
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
-; We have 4 kernels:
-;   - Each kernel has an internal helper
-;   - @A and @B's helpers does an indirect call.
-;
-; We default to putting A/B in P0, alongside a copy
-; of all helpers who have their address taken.
-; The other kernels can still go into separate partitions.
-;
-; Note that dependency discovery shouldn't stop upon finding an
-; indirect call. HelperC/D should also end up in P0 as they
-; are dependencies of HelperB.
-
 ; CHECK0: define internal void @HelperD
 ; CHECK0: define amdgpu_kernel void @D
 

>From 953dd1f895669dea24cf9e17246bba707df1a815 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 29 Aug 2024 13:04:15 +0200
Subject: [PATCH 2/3] [AMDGPU][SplitModule] Handle !callees metadata

---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  | 29 +++++++-
 .../kernels-dependency-indirect-callee-md.ll  | 69 +++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect-callee-md.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index e76fc937793669..17bbce8c1c98de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -486,6 +486,27 @@ void SplitGraph::Node::visitAllDependencies(
   }
 }
 
+/// Checks if \p I has MD_callees and if it does, parse it and put the function
+/// in \p Callees.
+///
+/// \returns true if there was metadata and it was parsed correctly. false if
+/// there was no MD or if it contained unknown entries.
+static bool handleCalleesMD(const Instruction &I,
+                            SmallVector<Function *> &Callees) {
+  auto *MD = I.getMetadata(LLVMContext::MD_callees);
+  if (!MD)
+    return false;
+
+  for (const auto &Op : MD->operands()) {
+    Function *Callee = mdconst::extract_or_null<Function>(Op);
+    if (!Callee)
+      return false;
+    Callees.push_back(Callee);
+  }
+
+  return true;
+}
+
 void SplitGraph::buildGraph(CallGraph &CG) {
   SplitModuleTimer SMT("buildGraph", "graph construction");
   LLVM_DEBUG(
@@ -522,6 +543,8 @@ void SplitGraph::buildGraph(CallGraph &CG) {
       LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for " << Fn.getName()
                         << " - analyzing function\n");
 
+      SmallVector<Function *> KnownCallees;
+
       bool HasIndirectCall = false;
       for (const auto &Inst : instructions(Fn)) {
         // look at all calls without a direct callee.
@@ -536,6 +559,9 @@ void SplitGraph::buildGraph(CallGraph &CG) {
             continue;
           }
 
+          if (handleCalleesMD(Inst, KnownCallees))
+            continue;
+
           // everything else is handled conservatively.
           HasIndirectCall = true;
         }
@@ -544,7 +570,8 @@ void SplitGraph::buildGraph(CallGraph &CG) {
       if (HasIndirectCall) {
         LLVM_DEBUG(dbgs() << "    indirect call found\n");
         FnsWithIndirectCalls.push_back(&Fn);
-      }
+      } else if (!KnownCallees.empty())
+        DirectCallees.insert(KnownCallees.begin(), KnownCallees.end());
     }
 
     Node &N = getNode(Cache, Fn);
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect-callee-md.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect-callee-md.ll
new file mode 100644
index 00000000000000..f1ed02b2502a06
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect-callee-md.ll
@@ -0,0 +1,69 @@
+; RUN: sed -s 's/_MD_/, !callees !{ptr @CallCandidate0}/' %s | llvm-split -o %t -j 3 -mtriple amdgcn-amd-amdhsa
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+
+; RUN: sed -s 's/_MD_//g' %s | llvm-split -o %t-nomd -j 3 -mtriple amdgcn-amd-amdhsa
+; RUN: llvm-dis -o - %t-nomd0 | FileCheck --check-prefix=CHECK-NOMD0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t-nomd1 | FileCheck --check-prefix=CHECK-NOMD1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t-nomd2 | FileCheck --check-prefix=CHECK-NOMD2 --implicit-check-not=define %s
+
+; CHECK0: define internal void @HelperC
+; CHECK0: define amdgpu_kernel void @C
+
+; CHECK1: define hidden void @CallCandidate1
+; CHECK1: define internal void @HelperB
+; CHECK1: define amdgpu_kernel void @B
+
+; CHECK2: define internal void @HelperA
+; CHECK2: define hidden void @CallCandidate0
+; CHECK2: define amdgpu_kernel void @A
+
+; CHECK-NOMD0: define internal void @HelperC
+; CHECK-NOMD0: define amdgpu_kernel void @C
+
+; CHECK-NOMD1: define internal void @HelperB
+; CHECK-NOMD1: define amdgpu_kernel void @B
+
+; CHECK-NOMD2: define internal void @HelperA
+; CHECK-NOMD2: define hidden void @CallCandidate0
+; CHECK-NOMD2: define hidden void @CallCandidate1
+; CHECK-NOMD2: define amdgpu_kernel void @A
+
+ at addrthief = global [2 x ptr] [ptr @CallCandidate0, ptr @CallCandidate1]
+
+define internal void @HelperA(ptr %call) {
+  call void %call() _MD_
+  ret void
+}
+
+define internal void @CallCandidate0() {
+  ret void
+}
+
+define internal void @CallCandidate1() {
+  ret void
+}
+
+define internal void @HelperB() {
+  ret void
+}
+
+define internal void @HelperC() {
+  ret void
+}
+
+define amdgpu_kernel void @A(ptr %call) {
+  call void @HelperA(ptr %call)
+  ret void
+}
+
+define amdgpu_kernel void @B() {
+  call void @HelperB()
+  ret void
+}
+
+define amdgpu_kernel void @C() {
+  call void @HelperC()
+  ret void
+}

>From f64e50a7500422698ffb8268a981d401c8f923ea Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 16 Sep 2024 15:31:15 +0200
Subject: [PATCH 3/3] Comments, minor refactor

---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 40 ++++++++++----------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 17bbce8c1c98de..96f183f5e8304e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -492,7 +492,7 @@ void SplitGraph::Node::visitAllDependencies(
 /// \returns true if there was metadata and it was parsed correctly. false if
 /// there was no MD or if it contained unknown entries.
 static bool handleCalleesMD(const Instruction &I,
-                            SmallVector<Function *> &Callees) {
+                            SmallVectorImpl<Function *> &Callees) {
   auto *MD = I.getMetadata(LLVMContext::MD_callees);
   if (!MD)
     return false;
@@ -544,30 +544,32 @@ void SplitGraph::buildGraph(CallGraph &CG) {
                         << " - analyzing function\n");
 
       SmallVector<Function *> KnownCallees;
-
-      bool HasIndirectCall = false;
+      bool HasUnknownIndirectCall = false;
       for (const auto &Inst : instructions(Fn)) {
         // look at all calls without a direct callee.
-        if (const auto *CB = dyn_cast<CallBase>(&Inst);
-            CB && !CB->getCalledFunction()) {
-          // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
-          // true.
-          if (CB->isInlineAsm()) {
-            if (InlineAsmIsIndirectCall)
-              HasIndirectCall = true;
-            LLVM_DEBUG(dbgs() << "    found inline assembly\n");
-            continue;
-          }
+        const auto *CB = dyn_cast<CallBase>(&Inst);
+        if (!CB || CB->getCalledFunction())
+          continue;
+
+        // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
+        // true.
+        if (CB->isInlineAsm()) {
+          if (InlineAsmIsIndirectCall)
+            HasUnknownIndirectCall = true;
+          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
+          continue;
+        }
 
-          if (handleCalleesMD(Inst, KnownCallees))
-            continue;
+        if (handleCalleesMD(Inst, KnownCallees))
+          continue;
 
-          // everything else is handled conservatively.
-          HasIndirectCall = true;
-        }
+        // Everything else is handled conservatively. If we fall into the
+        // conservative case don't bother analyzing further.
+        HasUnknownIndirectCall = true;
+        break;
       }
 
-      if (HasIndirectCall) {
+      if (HasUnknownIndirectCall) {
         LLVM_DEBUG(dbgs() << "    indirect call found\n");
         FnsWithIndirectCalls.push_back(&Fn);
       } else if (!KnownCallees.empty())



More information about the llvm-commits mailing list