[llvm] [AMDGPU] Fix module split's assumption on kernels (PR #116280)

Siu Chi Chan via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 29 07:56:50 PST 2024


https://github.com/scchan updated https://github.com/llvm/llvm-project/pull/116280

>From 9ba9fd2da5507ae97835b2ded98ef1d3bacbd292 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan at amd.com>
Date: Thu, 14 Nov 2024 01:01:44 +0000
Subject: [PATCH] [AMDGPU] Fix module split's assumption on kernels

Module split assumes that a kernel function must have an external
linkage; however, that isn't the case.  For example, a static kernel
function will have a weak_odr linkage

Change-Id: I1e5dee0de1fd866b365f4090a574e1b2961f8dca
---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  | 11 +--
 .../AMDGPU/large-kernels-merging-weak_odr.ll  | 94 +++++++++++++++++++
 2 files changed, 99 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 3332ed9c006e54..09e7c5389cdbf9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -158,13 +158,12 @@ static auto formatRatioOf(CostType Num, CostType Dem) {
 /// Non-copyable functions cannot be cloned into multiple partitions, and only
 /// one copy of the function can be present across all partitions.
 ///
-/// External functions fall into this category. If we were to clone them, we
-/// would end up with multiple symbol definitions and a very unhappy linker.
+/// Kernel functions and external functions fall into this category. If we were
+/// to clone them, we would end up with multiple symbol definitions and a very
+/// unhappy linker.
 static bool isNonCopyable(const Function &F) {
-  assert(AMDGPU::isEntryFunctionCC(F.getCallingConv())
-             ? F.hasExternalLinkage()
-             : true && "Kernel w/o external linkage?");
-  return F.hasExternalLinkage() || !F.isDefinitionExact();
+  return AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
+         F.hasExternalLinkage() || !F.isDefinitionExact();
 }
 
 /// If \p GV has local linkage, make it external + hidden.
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
new file mode 100644
index 00000000000000..839688e7feb8bb
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
@@ -0,0 +1,94 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+
+; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
+; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 --implicit-check-not=define %s
+
+; 2 kernels (A/B) are large and share all their dependencies.
+; They should go in the same partition, the remaining kernel should
+; go somewhere else, and one partition should be empty.
+;
+; Also check w/o large kernels processing to verify they are indeed handled
+; differently.
+
+; P0 is empty
+; CHECK0: declare
+
+; CHECK1: define internal void @HelperC()
+; CHECK1: define weak_odr amdgpu_kernel void @C
+
+; CHECK2: define internal void @large2()
+; CHECK2: define internal void @large1()
+; CHECK2: define internal void @large0()
+; CHECK2: define internal void @HelperA()
+; CHECK2: define internal void @HelperB()
+; CHECK2: define amdgpu_kernel void @A
+; CHECK2: define weak_odr amdgpu_kernel void @B
+
+; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
+; NOLARGEKERNELS-CHECK0: define weak_odr amdgpu_kernel void @C
+
+; NOLARGEKERNELS-CHECK1: define internal void @large2()
+; NOLARGEKERNELS-CHECK1: define internal void @large1()
+; NOLARGEKERNELS-CHECK1: define internal void @large0()
+; NOLARGEKERNELS-CHECK1: define internal void @HelperB()
+; NOLARGEKERNELS-CHECK1: define weak_odr amdgpu_kernel void @B
+
+; NOLARGEKERNELS-CHECK2: define internal void @large2()
+; NOLARGEKERNELS-CHECK2: define internal void @large1()
+; NOLARGEKERNELS-CHECK2: define internal void @large0()
+; NOLARGEKERNELS-CHECK2: define internal void @HelperA()
+; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A
+
+
+define internal void @large2() {
+  store volatile i32 42, ptr null
+  call void @large2()
+  ret void
+}
+
+define internal void @large1() {
+  call void @large1()
+  call void @large2()
+  ret void
+}
+
+define internal void @large0() {
+  call void @large0()
+  call void @large1()
+  call void @large2()
+  ret void
+}
+
+define internal void @HelperA() {
+  call void @large0()
+  ret void
+}
+
+define internal void @HelperB() {
+  call void @large0()
+  ret void
+}
+
+define amdgpu_kernel void @A() {
+  call void @HelperA()
+  ret void
+}
+
+define weak_odr amdgpu_kernel void @B() {
+  call void @HelperB()
+  ret void
+}
+
+define internal void @HelperC() {
+  ret void
+}
+
+define weak_odr amdgpu_kernel void @C() {
+  call void @HelperC()
+  ret void
+}



More information about the llvm-commits mailing list