[Openmp-commits] [compiler-rt] [llvm] [openmp] [compiler-rt] Define GPU specific handling of profiling functions (PR #185763)

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Thu Mar 19 07:20:28 PDT 2026


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/185763

>From 6b413b2df6198795c4c2fef4a2e09413074a89db Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 10 Mar 2026 15:47:47 -0500
Subject: [PATCH] [compiler-rt] Define GPU specific handling of profiling
 functions

Summary:
The changes in github.com/llvm/llvm-project/pull/185552 allowed us to
start building the standard `libclang_rt.profile.a` for GPU targets.
This PR expands this by adding an optimized GPU routine for counter
increment and removing the special-case handling of these functions in
the OpenMP runtime.

The vast majority of these functions are boilerplate, but we should be able
to do more interesting things with this in the future, like value or
memory profiling.

Change uniform counter

fix

Fix outdated name

comment

missing arg
---
 compiler-rt/lib/profile/CMakeLists.txt        |  1 +
 compiler-rt/lib/profile/InstrProfiling.h      | 10 +++++
 .../lib/profile/InstrProfilingPlatformGPU.c   | 42 +++++++++++++++++++
 .../Instrumentation/InstrProfiling.cpp        | 15 ++++++-
 offload/test/CMakeLists.txt                   |  6 ---
 offload/test/lit.cfg                          | 13 +++++-
 offload/test/lit.site.cfg.in                  |  1 -
 openmp/device/CMakeLists.txt                  |  1 -
 openmp/device/include/Profiling.h             | 21 ----------
 openmp/device/src/Profiling.cpp               | 18 --------
 10 files changed, 78 insertions(+), 50 deletions(-)
 create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
 delete mode 100644 openmp/device/include/Profiling.h
 delete mode 100644 openmp/device/src/Profiling.cpp

diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 4cc2610cec870..86328b4c13922 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -74,6 +74,7 @@ set(PROFILE_SOURCES
   InstrProfilingPlatformLinux.c
   InstrProfilingPlatformOther.c
   InstrProfilingPlatformWindows.c
+  InstrProfilingPlatformGPU.c
   )
 
 if (NOT COMPILER_RT_PROFILE_BAREMETAL)
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 187ef55ef3784..54013d7e6568d 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -166,6 +166,16 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
                                             uint32_t CounterIndex,
                                             uint64_t CounterValue);
 
+/*!
+ * \brief Wave-cooperative counter increment for GPU targets.
+ *
+ * Reduces per-lane atomic contention by electing a single lane per wave to
+ * perform the counter update. \c Uniform is an optional counter that is only
+ * incremented when the whole wave is active (a uniform increment).
+ */
+void __llvm_profile_instrument_gpu(uint64_t *Counter, uint64_t *Uniform,
+                                   uint64_t Step);
+
 /*!
  * \brief Write instrumentation data to the current file.
  *
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
new file mode 100644
index 0000000000000..f3c2255c9de29
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
@@ -0,0 +1,42 @@
+/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
+// provides the wave-cooperative counter increment used by instrumented code.
+//
+// Platform plumbing (section boundaries, binary IDs, VNodes) is handled by
+// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
+
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+
+#include "InstrProfiling.h"
+#include <gpuintrin.h>
+
+// Returns true if \p mask has every lane of the wave set (fully occupied).
+static bool is_uniform(uint64_t mask) {
+  const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes());
+  return mask == uniform_mask;
+}
+
+// Wave-cooperative counter increment. The instrumentation pass emits calls to
+// this in place of the default non-atomic load/add/store or atomicrmw sequence.
+// A non-null uniform gets the same increment only for fully occupied waves.
+COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
+                                                          uint64_t *uniform,
+                                                          uint64_t step) {
+  uint64_t mask = __gpu_lane_mask();
+  if (__gpu_is_first_in_lane(mask)) {
+    __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
+                              __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+    if (uniform && is_uniform(mask))
+      __scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask),
+                                __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
+  }
+}
+
+#endif
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 199b7357fa860..d1696f4afbe36 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1192,8 +1192,19 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   auto *Addr = getCounterAddress(Inc);
 
   IRBuilder<> Builder(Inc);
-  if (Options.Atomic || AtomicCounterUpdateAll ||
-      (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
+  if (isGPUProfTarget(M)) {
+    auto *I64Ty = Builder.getInt64Ty();
+    auto *PtrTy = Builder.getPtrTy();
+    auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()),
+                                       {PtrTy, PtrTy, I64Ty}, false);
+    auto Callee =
+        M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
+    Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
+    Value *Uniform =
+        ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
+    Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()});
+  } else if (Options.Atomic || AtomicCounterUpdateAll ||
+             (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
     Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
                             MaybeAlign(), AtomicOrdering::Monotonic);
   } else {
diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt
index 40da2a7d573ee..434f25f512a49 100644
--- a/offload/test/CMakeLists.txt
+++ b/offload/test/CMakeLists.txt
@@ -12,12 +12,6 @@ else()
   set(LIBOMPTARGET_DEBUG False)
 endif()
 
-if ("compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-  set(LIBOMPTARGET_TEST_GPU_PGO True)
-else()
-  set(LIBOMPTARGET_TEST_GPU_PGO False)
-endif()
-
 # Replace the space from user's input with ";" in case that CMake add escape
 # char into the lit command.
 string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 2d5d69167109d..2226764f9aa97 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -2,6 +2,7 @@
 # Configuration file for the 'lit' test runner.
 
 import os
+import glob
 import lit.formats
 
 # Tell pylint that we know config and lit_config exist somewhere.
@@ -132,7 +133,17 @@ if config.libomptarget_has_libc:
   config.available_features.add('libc')
 
 profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
-if config.libomptarget_test_pgo:
+target = config.libomptarget_current_target
+for suffix in ['-JIT-LTO', '-LTO']:
+  if target.endswith(suffix):
+    target = target[:-len(suffix)]
+    break
+has_profile_rt = True
+if target.startswith('amdgcn') or target.startswith('nvptx'):
+  has_profile_rt = bool(glob.glob(os.path.join(
+      config.llvm_lib_directory, 'clang', '*', 'lib', target,
+      'libclang_rt.profile.a')))
+if has_profile_rt:
   config.available_features.add('pgo')
   config.substitutions.append(("%profdata", profdata_path))
 
diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in
index c8ba45c9683e2..47b1fbd185146 100644
--- a/offload/test/lit.site.cfg.in
+++ b/offload/test/lit.site.cfg.in
@@ -27,7 +27,6 @@ config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@"
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
-config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@
 config.offload_tblgen = "@OFFLOAD_TBLGEN_EXECUTABLE@"
 # Let the main config do the real work.
 lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 096a6fe0b6e7e..ff5a64fdd2f0f 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -16,7 +16,6 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp
diff --git a/openmp/device/include/Profiling.h b/openmp/device/include/Profiling.h
deleted file mode 100644
index d994752254121..0000000000000
--- a/openmp/device/include/Profiling.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_PROFILING_H
-#define OMPTARGET_DEVICERTL_PROFILING_H
-
-extern "C" {
-void __llvm_profile_register_function(void *Ptr);
-void __llvm_profile_register_names_function(void *Ptr, long int I);
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
-}
-
-#endif
diff --git a/openmp/device/src/Profiling.cpp b/openmp/device/src/Profiling.cpp
deleted file mode 100644
index df141af5ebeea..0000000000000
--- a/openmp/device/src/Profiling.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===------- Profiling.cpp ---------------------------------------- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Profiling.h"
-
-extern "C" {
-
-// Provides empty implementations for certain functions in compiler-rt
-// that are emitted by the PGO instrumentation.
-void __llvm_profile_register_function(void *Ptr) {}
-void __llvm_profile_register_names_function(void *Ptr, long int I) {}
-void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
-}



More information about the Openmp-commits mailing list