[compiler-rt] [llvm] [openmp] [compiler-rt] Define GPU specific handling of profiling functions (PR #185763)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 14:49:33 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-pgo
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
Summary:
The changes in github.com/llvm/llvm-project/pull/185552 allowed us to
start building the standard `libclang_rt.profile.a` for GPU targets.
This PR expands this by adding an optimized GPU routine for counter
increment and removing the special-case handling of these functions in
the OpenMP runtime.
The vast majority of these functions are boilerplate, but this should let
us do more interesting things in the future, such as value or memory
profiling.
---
Full diff: https://github.com/llvm/llvm-project/pull/185763.diff
9 Files Affected:
- (modified) compiler-rt/lib/profile/CMakeLists.txt (+1)
- (modified) compiler-rt/lib/profile/InstrProfiling.h (+8)
- (added) compiler-rt/lib/profile/InstrProfilingPlatformGPU.c (+37)
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+12-2)
- (modified) offload/test/CMakeLists.txt (+1-1)
- (modified) offload/test/lit.cfg (+14-2)
- (modified) openmp/device/CMakeLists.txt (-1)
- (removed) openmp/device/include/Profiling.h (-21)
- (removed) openmp/device/src/Profiling.cpp (-18)
``````````diff
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 4cc2610cec870..86328b4c13922 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -74,6 +74,7 @@ set(PROFILE_SOURCES
InstrProfilingPlatformLinux.c
InstrProfilingPlatformOther.c
InstrProfilingPlatformWindows.c
+ InstrProfilingPlatformGPU.c
)
if (NOT COMPILER_RT_PROFILE_BAREMETAL)
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 187ef55ef3784..f01cbec44be64 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -166,6 +166,14 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
uint32_t CounterIndex,
uint64_t CounterValue);
+/*!
+ * \brief Wave-cooperative counter increment for GPU targets.
+ *
+ * Reduces per-lane atomic contention by electing a single lane per wave to
+ * perform the counter update.
+ */
+void __llvm_profile_instrument_gpu(uint64_t *Counter, uint64_t Step);
+
/*!
* \brief Write instrumentation data to the current file.
*
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
new file mode 100644
index 0000000000000..55bfe6f1aab92
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
@@ -0,0 +1,37 @@
+/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
+// provides:
+//
+// Platform plumbing (section boundaries, binary IDs, VNodes) are handled by
+// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
+
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+
+#include "InstrProfiling.h"
+#include <gpuintrin.h>
+
+// Wave-cooperative counter increment. The instrumentation pass emits calls to
+// this in place of the default non-atomic load/add/store or atomicrmw sequence.
+COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
+ uint64_t step) {
+ uint64_t mask = __gpu_lane_mask();
+ if (__gpu_is_first_in_lane(mask))
+ __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
+ __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+}
+
+// InstrProfilingValue.c is excluded from GPU builds (COMPILER_RT_PROFILE_
+// BAREMETAL) because value profiling requires malloc. The PGO instrumentation
+// pass may still emit calls to this for memory intrinsics, so provide a no-op
+// to prevent link errors.
+COMPILER_RT_VISIBILITY void
+__llvm_profile_instrument_memop(int64_t i, void *ptr, int32_t i2) {}
+
+#endif
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 6d1e6bf68e3f1..0506e1a946910 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1193,8 +1193,18 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
auto *Addr = getCounterAddress(Inc);
IRBuilder<> Builder(Inc);
- if (Options.Atomic || AtomicCounterUpdateAll ||
- (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
+ if (isGPUProfTarget(M)) {
+ auto *I64Ty = Builder.getInt64Ty();
+ auto *PtrTy = Builder.getPtrTy();
+ auto *CalleeTy =
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ {PtrTy, I64Ty}, false);
+ auto Callee = M.getOrInsertFunction(
+ "__llvm_profile_instrument_gpu", CalleeTy);
+ Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
+ Builder.CreateCall(Callee, {CastAddr, Inc->getStep()});
+ } else if (Options.Atomic || AtomicCounterUpdateAll ||
+ (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
MaybeAlign(), AtomicOrdering::Monotonic);
} else {
diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt
index 711621de9075d..69b4979177f54 100644
--- a/offload/test/CMakeLists.txt
+++ b/offload/test/CMakeLists.txt
@@ -12,7 +12,7 @@ else()
set(LIBOMPTARGET_DEBUG False)
endif()
-if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+if (NOT OPENMP_STANDALONE_BUILD)
set(LIBOMPTARGET_TEST_GPU_PGO True)
else()
set(LIBOMPTARGET_TEST_GPU_PGO False)
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 2d5d69167109d..d2ecc2524f1db 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -2,6 +2,7 @@
# Configuration file for the 'lit' test runner.
import os
+import glob
import lit.formats
# Tell pylint that we know config and lit_config exist somewhere.
@@ -133,8 +134,19 @@ if config.libomptarget_has_libc:
profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
if config.libomptarget_test_pgo:
- config.available_features.add('pgo')
- config.substitutions.append(("%profdata", profdata_path))
+ target = config.libomptarget_current_target
+ for suffix in ['-JIT-LTO', '-LTO']:
+ if target.endswith(suffix):
+ target = target[:-len(suffix)]
+ break
+ has_profile_rt = True
+ if target.startswith('amdgcn') or target.startswith('nvptx'):
+ has_profile_rt = bool(glob.glob(os.path.join(
+ config.llvm_lib_directory, 'clang', '*', 'lib', target,
+ 'libclang_rt.profile.a')))
+ if has_profile_rt:
+ config.available_features.add('pgo')
+ config.substitutions.append(("%profdata", profdata_path))
# Determine whether the test system supports unified memory.
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 096a6fe0b6e7e..ff5a64fdd2f0f 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -16,7 +16,6 @@ set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp
diff --git a/openmp/device/include/Profiling.h b/openmp/device/include/Profiling.h
deleted file mode 100644
index d994752254121..0000000000000
--- a/openmp/device/include/Profiling.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_PROFILING_H
-#define OMPTARGET_DEVICERTL_PROFILING_H
-
-extern "C" {
-void __llvm_profile_register_function(void *Ptr);
-void __llvm_profile_register_names_function(void *Ptr, long int I);
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
-}
-
-#endif
diff --git a/openmp/device/src/Profiling.cpp b/openmp/device/src/Profiling.cpp
deleted file mode 100644
index df141af5ebeea..0000000000000
--- a/openmp/device/src/Profiling.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===------- Profiling.cpp ---------------------------------------- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Profiling.h"
-
-extern "C" {
-
-// Provides empty implementations for certain functions in compiler-rt
-// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
-}
``````````
</details>
https://github.com/llvm/llvm-project/pull/185763
More information about the llvm-commits
mailing list