[clang] ed022d9 - [clang][AMDGPU] Enable module splitting by default (#128509)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 24 06:43:14 PDT 2025
Author: Pierre van Houtryve
Date: 2025-03-24T14:43:08+01:00
New Revision: ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4
URL: https://github.com/llvm/llvm-project/commit/ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4
DIFF: https://github.com/llvm/llvm-project/commit/ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4.diff
LOG: [clang][AMDGPU] Enable module splitting by default (#128509)
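The default number of partitions is the number of cores on the machine
with a cap at 16, as going above 16 is unlikely to be useful in the
common case. (Note: the diff below does not implement this; getFullLTOPartitions
returns a fixed default of 8 when no option is given.)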
Adds a --flto-partitions=<N> option to override the number of partitions
easily (without having to use -Xoffload-linker). Setting it to 1
effectively disables module splitting.
Fixes SWDEV-506214
Added:
clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
Modified:
clang/include/clang/Driver/Options.td
clang/lib/Driver/ToolChains/AMDGPU.cpp
clang/lib/Driver/ToolChains/AMDGPU.h
clang/lib/Driver/ToolChains/HIPAMD.cpp
clang/test/Driver/amdgpu-toolchain.c
clang/test/Driver/hip-toolchain-rdc-static-lib.hip
clang/test/Driver/hip-toolchain-rdc.hip
Removed:
################################################################################
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index fbd5cf632c350..59a57c83c6b89 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1392,6 +1392,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
HelpText<"Compile HIP source to relocatable">;
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
+def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
+ HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
}
// Clang specific/exclusive options for OpenACC.
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 6a35a2feabc9b..e919f4e941f47 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -630,8 +630,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
if (C.getDriver().isUsingLTO()) {
- addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
- C.getDriver().getLTOMode() == LTOK_Thin);
+ const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
+ addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);
+
+ if (!ThinLTO)
+ addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
} else if (Args.hasArg(options::OPT_mcpu_EQ)) {
CmdArgs.push_back(Args.MakeArgString(
"-plugin-opt=mcpu=" +
@@ -708,6 +711,33 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
options::OPT_m_amdgpu_Features_Group);
}
+static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) {
+ const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
+ // No --flto-partitions= on the command line: fall back to a fixed default of 8.
+ if (!A)
+ return 8;
+ int Value = 0;
+ if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
+ D.Diag(diag::err_drv_invalid_int_value)
+ << A->getAsString(Args) << A->getValue();
+ return 1;
+ }
+
+ return Value;
+}
+
+void amdgpu::addFullLTOPartitionOption(const Driver &D,
+ const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) {
+ // TODO: Should this be restricted to -fgpu-rdc only? Currently we'll
+ // also do it for non-gpu-rdc LTO.
+
+ if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) {
+ CmdArgs.push_back(
+ Args.MakeArgString("--lto-partitions=" + Twine(NumParts)));
+ }
+}
+
/// AMDGPU Toolchain
AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index bc941a40445ad..08bd4fa556f78 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple,
const llvm::opt::ArgList &Args,
std::vector<StringRef> &Features);
+void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs);
} // end namespace amdgpu
} // end namespace tools
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 55a8f2ca87de0..dc3300b00f9ff 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
+ amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);
+
// Given that host and device linking happen in separate processes, the device
// linker doesn't always have the visibility as to which device symbols are
// needed by a program, especially for the device symbol dependencies that are
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index c1c5aa8e90e68..6617108e59fcf 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -19,10 +19,12 @@
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
-// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
+// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
+// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
+// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
+
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
-// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
@@ -36,3 +38,17 @@
// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \
// RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s
// STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+
+// Check --flto-partitions
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
+// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
new file mode 100644
index 0000000000000..e345bd3f5be6b
--- /dev/null
+++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
@@ -0,0 +1,35 @@
+// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS
+
+// FIXED-PARTS-NOT: "*.llvm-link"
+// FIXED-PARTS-NOT: ".*opt"
+// FIXED-PARTS-NOT: ".*llc"
+// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
+// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
+// FIXED-PARTS-SAME: "--lto-partitions=42"
+// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0
+
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1
+
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
index 5276faf31bdc2..6f38a06f7cf31 100644
--- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
@@ -49,6 +49,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx803"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]
// generate image for device side path on gfx900
@@ -77,6 +78,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "--whole-archive"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
// CHECK-SAME: "--no-whole-archive"
diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip
index 96da423144c1c..9015702e3211a 100644
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@@ -147,6 +147,7 @@
// CHECK-NOT: ".*llc"
// CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
// combine images generated into hip fat binary object
More information about the cfe-commits
mailing list