[llvm] [AMDGPU][SplitModule] Do not create empty modules (PR #135761)

via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 15 01:41:17 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Skip creating a module if no function is going to be imported.
Also includes a change so that if the first partition is empty (which can happen),
we import global with non-local linkage into the first non-empty partition, instead
of P0 all the time.

I thought we'd need to change users of the SplitModule callback so they can deal with less modules
than the number requested, but no. We already return only 1 module in some cases and
it seems to be handled just fine.

Fixes SWDEV-523146

---
Full diff: https://github.com/llvm/llvm-project/pull/135761.diff


6 Files Affected:

- (modified) llvm/include/llvm/Target/TargetMachine.h (+3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp (+14-1) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll (+4-8) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll (+10-12) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll (+10-12) 
- (added) llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll (+21) 


``````````diff
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 5a16c9cafcd7a..566e7dba6792b 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -451,6 +451,9 @@ class TargetMachine {
   /// Entry point for module splitting. Targets can implement custom module
   /// splitting logic, mainly used by LTO for --lto-partitions.
   ///
+  /// On success, this guarantees that between 1 and \p NumParts modules were
+  /// created and passed to \p ModuleCallBack.
+  ///
   /// \returns `true` if the module was split, `false` otherwise. When  `false`
   /// is returned, it is assumed that \p ModuleCallback has never been called
   /// and \p M has not been modified.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index dd3bec774ec67..b7b9814b93427 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -1478,6 +1478,10 @@ static void splitAMDGPUModule(
              << "' - Partition summaries will not be printed\n";
   }
 
+  // One module will import all GlobalValues that are not Functions
+  // and are not subject to conservative import.
+  bool ImportAllGVs = true;
+
   for (unsigned PID = 0; PID < NumParts; ++PID) {
     SplitModuleTimer SMT2("modules_creation",
                           "creating modules for each partition");
@@ -1487,6 +1491,13 @@ static void splitAMDGPUModule(
     for (unsigned NodeID : (*Proposal)[PID].set_bits())
       FnsInPart.insert(&SG.getNode(NodeID).getFunction());
 
+    // Don't create empty module, except for PID 0 because
+    if (FnsInPart.empty()) {
+      LLVM_DEBUG(dbgs() << "[split] P" << PID
+                        << " is empty, not creating module\n");
+      continue;
+    }
+
     ValueToValueMapTy VMap;
     CostType PartCost = 0;
     std::unique_ptr<Module> MPart(
@@ -1501,9 +1512,11 @@ static void splitAMDGPUModule(
           }
 
           // Everything else goes in the first partition.
-          return needsConservativeImport(GV) || PID == 0;
+          return needsConservativeImport(GV) || ImportAllGVs;
         }));
 
+    ImportAllGVs = false;
+
     // FIXME: Aliases aren't seen often, and their handling isn't perfect so
     // bugs are possible.
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
index 708b5a006be60..0f20a73b1928e 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels:
 ;   - A does a direct call to HelperA
@@ -11,14 +10,11 @@
 ; The helper functions will get externalized, so C/A will end up
 ; in the same partition.
 
-; P0 is empty.
-; CHECK0: declare
+; CHECK0: define amdgpu_kernel void @B(ptr %dst)
 
-; CHECK1: define amdgpu_kernel void @B(ptr %dst)
-
-; CHECK2: define hidden void @HelperA()
-; CHECK2: define amdgpu_kernel void @A()
-; CHECK2: define amdgpu_kernel void @C()
+; CHECK1: define hidden void @HelperA()
+; CHECK1: define amdgpu_kernel void @A()
+; CHECK1: define amdgpu_kernel void @C()
 
 define internal void @HelperA() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
index 839688e7feb8b..567275686fb9f 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
@@ -15,19 +14,18 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; P0 is empty
-; CHECK0: declare
+; Only two partitions created for the first command.
 
-; CHECK1: define internal void @HelperC()
-; CHECK1: define weak_odr amdgpu_kernel void @C
+; CHECK0: define internal void @HelperC()
+; CHECK0: define weak_odr amdgpu_kernel void @C
 
-; CHECK2: define internal void @large2()
-; CHECK2: define internal void @large1()
-; CHECK2: define internal void @large0()
-; CHECK2: define internal void @HelperA()
-; CHECK2: define internal void @HelperB()
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define weak_odr amdgpu_kernel void @B
+; CHECK1: define internal void @large2()
+; CHECK1: define internal void @large1()
+; CHECK1: define internal void @large0()
+; CHECK1: define internal void @HelperA()
+; CHECK1: define internal void @HelperB()
+; CHECK1: define amdgpu_kernel void @A
+; CHECK1: define weak_odr amdgpu_kernel void @B
 
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define weak_odr amdgpu_kernel void @C
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
index 807fb2e5f33ce..35133d20c4e07 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
@@ -15,19 +14,18 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; P0 is empty
-; CHECK0: declare
+; Only 2 partitions for the first command.
 
-; CHECK1: define internal void @HelperC()
-; CHECK1: define amdgpu_kernel void @C
+; CHECK0: define internal void @HelperC()
+; CHECK0: define amdgpu_kernel void @C
 
-; CHECK2: define internal void @large2()
-; CHECK2: define internal void @large1()
-; CHECK2: define internal void @large0()
-; CHECK2: define internal void @HelperA()
-; CHECK2: define internal void @HelperB()
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @B
+; CHECK1: define internal void @large2()
+; CHECK1: define internal void @large1()
+; CHECK1: define internal void @large0()
+; CHECK1: define internal void @HelperA()
+; CHECK1: define internal void @HelperB()
+; CHECK1: define amdgpu_kernel void @A
+; CHECK1: define amdgpu_kernel void @B
 
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C
diff --git a/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll b/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll
new file mode 100644
index 0000000000000..091010edd6be3
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+
+; Only 2 out of 3 partitions are created, check the external global is preserved in the first partition.
+
+; CHECK0: @foobar = linkonce_odr global i64 52
+; CHECK0: define amdgpu_kernel void @B
+
+; CHECK1-NOT: @foobar = linkonce_odr global i64 52
+; CHECK1: define amdgpu_kernel void @A
+
+ at foobar = linkonce_odr global i64 52
+
+define amdgpu_kernel void @A() {
+  ret void
+}
+
+define amdgpu_kernel void @B() {
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/135761


More information about the llvm-commits mailing list