[libc-commits] [clang] [compiler-rt] [libc] [llvm] [PGO][AMDGPU] Add offload profiling with uniformity-aware optimization (PR #177665)

Mon Apr 6 08:12:44 PDT 2026

================
@@ -0,0 +1,68 @@
+/*===- InstrProfilingGPU.c - GPU profile counter functions ----------------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+
+#include <gpuintrin.h>
+#include <stdint.h>
+
+#define ATOMIC_ADD(ptr, val)                                                   \
+  __scoped_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE)
+
+/*
+ * Check if this block is sampled (PatternOverflow mode).
+ * Samples by matching lower bits of flat block ID to 0.
+ *
+ * sampling_bits: 0 = all blocks (100%)
+ *                1 = even blocks (50%)
+ *                2 = every 4th block (25%)
+ *                3 = every 8th block (12.5%)
+ */
+__attribute__((visibility("hidden"), used)) int
+__gpu_pgo_is_sampled(uint32_t sampling_bits) {
+  if (sampling_bits == 0)
+    return 1;
+
+  uint32_t gdx = __gpu_num_blocks_x();
+  uint32_t gdy = __gpu_num_blocks_y();
+  uint32_t block_id = __gpu_block_id_x() + __gpu_block_id_y() * gdx +
+                      __gpu_block_id_z() * gdx * gdy;
+
+  uint32_t mask = (1u << sampling_bits) - 1;
+  return (block_id & mask) == 0;
+}
+
+typedef uint64_t __attribute__((address_space(1))) * global_u64_ptr;
+
+/* Full wave mask: all lanes active */
+#define FULL_WAVE_MASK ((__gpu_num_lanes() == 64) ? ~0ULL : 0xFFFFFFFFULL)
+
+/*
+ * Per-BB warp-aggregate counter increment using atomic add.
+ * Elects one leader lane per wave, counts active lanes, leader atomically
+ * adds (step * active_lanes). Also updates uniform counter when all lanes
+ * in the wave are active.
+ */
+__attribute__((visibility("hidden"), used)) void
+__gpu_pgo_increment(global_u64_ptr counter, global_u64_ptr uniform_counter,
+                    int64_t step) {
+  uint64_t lane_mask = __gpu_lane_mask();
+  uint64_t active = __gpu_ballot(lane_mask, 1);
+  if (__gpu_is_first_in_lane(lane_mask)) {
+    int64_t count = (int64_t)__builtin_popcountg(active) * step;
+    ATOMIC_ADD(counter, count);
+    if (uniform_counter && active == FULL_WAVE_MASK)
+      ATOMIC_ADD(uniform_counter, count);
+  }
+}
+
+#if defined(__AMDGPU__)
+__attribute__((weak)) const int __oclc_ABI_version = 600;
----------------
yxsamliu wrote:

Removed. __llvm_profile_sampling_gpu now uses only __gpu_block_id_x() instead of the full 3D linearization (__gpu_num_blocks_x/y). The 3D version required __oclc_ABI_version via __builtin_amdgcn_workgroup_size_x, which is unavailable at link time since the profile runtime is linked after device libs are internalized. Using block_id_x alone is sufficient for sampling — blocks sharing the same x-index are sampled together in 3D grids, a minor uniformity loss that does not affect profile correctness.

https://github.com/llvm/llvm-project/pull/177665