[clang] [clang][AMDGPU] Enable module splitting by default (PR #128509)

Pierre van Houtryve via cfe-commits cfe-commits at lists.llvm.org
Fri Feb 28 01:25:03 PST 2025


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/128509

>From cae772441c0d87a017f5cc2cb0b9c970c6b7fcde Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 24 Feb 2025 14:21:49 +0100
Subject: [PATCH 1/4] [clang][AMDGPU] Enable module splitting by default

The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case.

Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting.

Fixes SWDEV-506214
---
 clang/include/clang/Driver/Options.td         |  6 ++--
 clang/lib/Driver/ToolChains/AMDGPU.cpp        | 36 +++++++++++++++++--
 clang/lib/Driver/ToolChains/AMDGPU.h          |  2 ++
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |  2 ++
 clang/test/Driver/amdgpu-toolchain.c          | 20 +++++++++--
 .../hip-toolchain-rdc-flto-partitions.hip     | 35 ++++++++++++++++++
 .../Driver/hip-toolchain-rdc-static-lib.hip   |  2 ++
 clang/test/Driver/hip-toolchain-rdc.hip       |  1 +
 8 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index e521cbf678d93..6cb2fd1b755c1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
   HelpText<"Compile HIP source to relocatable">;
 def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
+def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
+  HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
 }
 
 // Clang specific/exclusive options for OpenACC.
@@ -3158,7 +3160,7 @@ def modules_reduced_bmi : Flag<["-"], "fmodules-reduced-bmi">,
   HelpText<"Generate the reduced BMI">,
   MarshallingInfoFlag<FrontendOpts<"GenReducedBMI">>;
 
-def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">, 
+def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
   Group<f_Group>, Visibility<[ClangOption, CC1Option]>, Alias<modules_reduced_bmi>;
 
 def fmodules_embed_all_files : Joined<["-"], "fmodules-embed-all-files">,
@@ -7417,7 +7419,7 @@ def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfie
 def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">,
   HelpText<"Turn off Type Based Alias Analysis">,
   MarshallingInfoFlag<CodeGenOpts<"RelaxedAliasing">>;
-defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">, 
+defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">,
 DefaultTrue,
   PosFlag<SetTrue, [], [ClangOption], "Enable">,
   NegFlag<SetFalse, [], [ClangOption], "Disable">,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 6a35a2feabc9b..820a335a4b384 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/TargetParser/Host.h"
 #include <optional>
@@ -630,8 +631,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   getToolChain().AddFilePathLibArgs(Args, CmdArgs);
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
   if (C.getDriver().isUsingLTO()) {
-    addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
-                  C.getDriver().getLTOMode() == LTOK_Thin);
+    const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
+    addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);
+
+    if (!ThinLTO)
+      addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
   } else if (Args.hasArg(options::OPT_mcpu_EQ)) {
     CmdArgs.push_back(Args.MakeArgString(
         "-plugin-opt=mcpu=" +
@@ -708,6 +712,34 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
                             options::OPT_m_amdgpu_Features_Group);
 }
 
+static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
+  const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
+  // In the absence of an option, use the number of available threads with a cap
+  // at 16 partitions. More than 16 partitions rarely benefits code splitting
+  // and can lead to more empty/small modules each with their own overhead.
+  if (!A)
+    return std::max(16u, llvm::hardware_concurrency().compute_thread_count());
+  int Value;
+  if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
+    D.Diag(diag::err_drv_invalid_int_value)
+        << A->getAsString(Args) << A->getValue();
+    return 1;
+  }
+
+  return Value;
+}
+
+void amdgpu::addFullLTOPartitionOption(const Driver &D,
+                                       const llvm::opt::ArgList &Args,
+                                       llvm::opt::ArgStringList &CmdArgs) {
+  // TODO: restrict to gpu-rdc only?
+
+  if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) {
+    CmdArgs.push_back(
+        Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts)));
+  }
+}
+
 /// AMDGPU Toolchain
 AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
                                  const ArgList &Args)
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index bc941a40445ad..08bd4fa556f78 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                              const llvm::opt::ArgList &Args,
                              std::vector<StringRef> &Features);
 
+void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
+                               llvm::opt::ArgStringList &CmdArgs);
 } // end namespace amdgpu
 } // end namespace tools
 
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 271626ed54aed..4f0511b272a98 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
 
   addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
 
+  amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);
+
   // Given that host and device linking happen in separate processes, the device
   // linker doesn't always have the visibility as to which device symbols are
   // needed by a program, especially for the device symbol dependencies that are
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index c1c5aa8e90e68..6617108e59fcf 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -19,10 +19,12 @@
 // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
 
 // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
-// RUN:   -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
+// RUN:   -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
+// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
+// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
+
 // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
 // RUN:   -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
-// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
 // MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
 
 // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
@@ -36,3 +38,17 @@
 // RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \
 // RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s
 // STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+
+// Check --flto-partitions
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN:   -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
+// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN:   -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN:   -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
new file mode 100644
index 0000000000000..e345bd3f5be6b
--- /dev/null
+++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
@@ -0,0 +1,35 @@
+// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
+// RUN:   --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN:   -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS
+
+// FIXED-PARTS-NOT: "*.llvm-link"
+// FIXED-PARTS-NOT: ".*opt"
+// FIXED-PARTS-NOT: ".*llc"
+// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
+// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
+// FIXED-PARTS-SAME: "--lto-partitions=42"
+// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
+// RUN:   --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN:   -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0
+
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
+// RUN:   --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN:   -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1
+
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
index 5276faf31bdc2..6f38a06f7cf31 100644
--- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
@@ -49,6 +49,7 @@
 // CHECK-NOT: ".*llc"
 // CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
 // CHECK-SAME: "-plugin-opt=mcpu=gfx803"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
 // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]
 
 // generate image for device side path on gfx900
@@ -77,6 +78,7 @@
 // CHECK-NOT: ".*llc"
 // CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
 // CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
 // CHECK-SAME: "--whole-archive"
 // CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
 // CHECK-SAME: "--no-whole-archive"
diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip
index 96da423144c1c..9015702e3211a 100644
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@@ -147,6 +147,7 @@
 // CHECK-NOT: ".*llc"
 // CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
 // CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
 // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
 
 // combine images generated into hip fat binary object

>From 10c286ad400bb8db4fd624c67671a00804f9a2a6 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 25 Feb 2025 08:17:29 +0100
Subject: [PATCH 2/4] use twine

---
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 820a335a4b384..2482be0eb66fb 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -736,7 +736,7 @@ void amdgpu::addFullLTOPartitionOption(const Driver &D,
 
   if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) {
     CmdArgs.push_back(
-        Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts)));
+        Args.MakeArgString("--lto-partitions=" + Twine(NumParts)));
   }
 }
 

>From ff26938b1ecb911003f026321823e5fa9a8deae7 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 25 Feb 2025 10:01:44 +0100
Subject: [PATCH 3/4] max -> min

---
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 2482be0eb66fb..3cc2f7fc99ec2 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -718,7 +718,7 @@ static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
   // at 16 partitions. More than 16 partitions rarely benefits code splitting
   // and can lead to more empty/small modules each with their own overhead.
   if (!A)
-    return std::max(16u, llvm::hardware_concurrency().compute_thread_count());
+    return std::min(16u, llvm::hardware_concurrency().compute_thread_count());
   int Value;
   if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
     D.Diag(diag::err_drv_invalid_int_value)

>From 315617dff8eda196152c089e27d4cbf17ac60daf Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 28 Feb 2025 10:24:50 +0100
Subject: [PATCH 4/4] Comments

---
 clang/lib/Driver/ToolChains/AMDGPU.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 3cc2f7fc99ec2..8831fb1c17634 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -712,14 +712,14 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
                             options::OPT_m_amdgpu_Features_Group);
 }
 
-static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
+static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) {
   const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
   // In the absence of an option, use the number of available threads with a cap
   // at 16 partitions. More than 16 partitions rarely benefits code splitting
   // and can lead to more empty/small modules each with their own overhead.
   if (!A)
     return std::min(16u, llvm::hardware_concurrency().compute_thread_count());
-  int Value;
+  int Value = 0;
   if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
     D.Diag(diag::err_drv_invalid_int_value)
         << A->getAsString(Args) << A->getValue();
@@ -732,9 +732,10 @@ static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
 void amdgpu::addFullLTOPartitionOption(const Driver &D,
                                        const llvm::opt::ArgList &Args,
                                        llvm::opt::ArgStringList &CmdArgs) {
-  // TODO: restrict to gpu-rdc only?
+  // TODO: Should this be restricted to fgpu-rdc only ? Currently we'll
+  //       also do it for non gpu-rdc LTO
 
-  if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) {
+  if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) {
     CmdArgs.push_back(
         Args.MakeArgString("--lto-partitions=" + Twine(NumParts)));
   }



More information about the cfe-commits mailing list