[clang] 44e306e - [Clang] Correctly link and handle PGO options on the GPU (#185761)

Thu Mar 19 07:18:16 PDT 2026

Author: Joseph Huber
Date: 2026-03-19T09:18:10-05:00
New Revision: 44e306ecdb02bdf3fff067a39bf5a763c786a78d

URL: https://github.com/llvm/llvm-project/commit/44e306ecdb02bdf3fff067a39bf5a763c786a78d
DIFF: https://github.com/llvm/llvm-project/commit/44e306ecdb02bdf3fff067a39bf5a763c786a78d.diff

LOG: [Clang] Correctly link and handle PGO options on the GPU (#185761)

Summary:
Currently, the GPU targets ignore the standard profiling arguments. This
PR changes the behavior to use the standard handling, which links the in
the now-present `libclang_rt.profile.a` if the user built with the
compiler-rt support enabled. If it is not present this is a linker error
and we can always suppress with `-Xarch_host` and `-Xarch_device`.
Hopefully this doesn't cause some people pain if they're used to doing
`-fprofile-generate` on a CPU unguarded since it was a stange mix of a
no-op and not a no-op on the GPU until now.

Added: 
    clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/amdgcn-amd-amdhsa/libclang_rt.profile.a
    clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/nvptx64-nvidia-cuda/libclang_rt.profile.a

Modified: 
    clang/lib/Driver/ToolChains/AMDGPU.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/test/Driver/amdgpu-toolchain.c
    clang/test/Driver/cuda-cross-compiling.c
    clang/test/Driver/openmp-offload-gpu.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 7bbdb71b1e24f..54fbd86168602 100644

--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -632,6 +632,8 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
         Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
   }
 
+  getToolChain().addProfileRTLibs(Args, CmdArgs);
+
   if (Args.hasArg(options::OPT_stdlib))
     CmdArgs.append({"-lc", "-lm"});
   if (Args.hasArg(options::OPT_startfiles)) {

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 3b852528d92c4..6416baf9126ff 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9357,9 +9357,23 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       OPT_flto_partitions_EQ,
       OPT_flto_EQ,
       OPT_hipspv_pass_plugin_EQ,
-      OPT_use_spirv_backend};
+      OPT_use_spirv_backend,
+      OPT_fprofile_generate,
+      OPT_fprofile_generate_EQ,
+      OPT_fprofile_instr_generate,
+      OPT_fprofile_instr_generate_EQ};
   const llvm::DenseSet<unsigned> LinkerOptions{OPT_mllvm, OPT_Zlinker_input};
   auto ShouldForwardForToolChain = [&](Arg *A, const ToolChain &TC) {
+    auto HasProfileRT = TC.getVFS().exists(
+        TC.getCompilerRT(Args, "profile", ToolChain::FT_Static));
+    // Don't forward profiling arguments if the toolchain doesn't support it.
+    // Without this check using it on the host would result in linker errors.
+    if (!HasProfileRT &&
+        (A->getOption().matches(OPT_fprofile_generate) ||
+         A->getOption().matches(OPT_fprofile_generate_EQ) ||
+         A->getOption().matches(OPT_fprofile_instr_generate) ||
+         A->getOption().matches(OPT_fprofile_instr_generate_EQ)))
+      return false;
     // Don't forward -mllvm to toolchains that don't support LLVM.
     return TC.HasNativeLLVMSupport() || A->getOption().getID() != OPT_mllvm;
   };

diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index e0020176800fd..2ca8886936f6c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -643,6 +643,8 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
+  getToolChain().addProfileRTLibs(Args, CmdArgs);
+
   if (Args.hasArg(options::OPT_stdlib))
     CmdArgs.append({"-lc", "-lm"});
   if (Args.hasArg(options::OPT_startfiles)) {

diff  --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/amdgcn-amd-amdhsa/libclang_rt.profile.a b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/amdgcn-amd-amdhsa/libclang_rt.profile.a
new file mode 100644
index 0000000000000..e69de29bb2d1d

diff  --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/nvptx64-nvidia-cuda/libclang_rt.profile.a b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/nvptx64-nvidia-cuda/libclang_rt.profile.a
new file mode 100644
index 0000000000000..e69de29bb2d1d

diff  --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index 459c1bdac246f..2a48ca6bb7670 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -46,3 +46,9 @@
 // RUN:  --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode 2>&1 \
 // RUN: | FileCheck -check-prefix=DEVICE-LIBS %s
 // DEVICE-LIBS: "-mlink-builtin-bitcode" "[[ROCM_PATH:.+]]ockl.bc"
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN:   -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN:   -fprofile-generate %s 2>&1 | FileCheck -check-prefixes=PROFILE %s
+//      PROFILE: ld.lld
+// PROFILE-SAME: "[[RESOURCE_DIR:.+]]{{/|\\\\}}lib{{/|\\\\}}amdgcn-amd-amdhsa{{/|\\\\}}libclang_rt.profile.a"

diff  --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index ed2853cae3ccc..1dea9426f75ce 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -112,3 +112,9 @@
 // RUN:   -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=PATH %s
 
 // PATH: clang-nvlink-wrapper{{.*}}"--cuda-path={{.*}}/Inputs/CUDA/usr/local/cuda"
+
+// RUN: %clang -### --target=nvptx64-nvidia-cuda -march=sm_89 -nogpulib \
+// RUN:   -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN:   -fprofile-generate %s 2>&1 | FileCheck -check-prefixes=PROFILE %s
+//      PROFILE: clang-nvlink-wrapper
+// PROFILE-SAME: "[[RESOURCE_DIR:.+]]{{/|\\\\}}lib{{/|\\\\}}nvptx64-nvidia-cuda{{/|\\\\}}libclang_rt.profile.a"

diff  --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index fb1bc9ffdbbd4..e057959d62044 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -410,3 +410,21 @@
 // RUN:   | FileCheck --check-prefix=SHOULD-EXTRACT %s
 //
 // SHOULD-EXTRACT: clang-linker-wrapper{{.*}}"--should-extract=gfx906"
+
+//
+// Check that `-fprofile-generate` flags are forwarded to link in the runtime
+// only if present in the resource directory.
+//
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:     -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \
+// RUN:     --offload-arch=gfx906 -fprofile-generate -nogpulib -nogpuinc %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=PROFILE %s
+//
+// PROFILE: clang-linker-wrapper{{.*}}--device-compiler=amdgcn-amd-amdhsa=-fprofile-generate
+// 
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --offload-arch=gfx906 -fprofile-generate -nogpulib -nogpuinc %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=NO-PROFILE %s
+//
+// NO-PROFILE-NOT: --device-compiler=amdgcn-amd-amdhsa=-fprofile-generate