[clang] [compiler-rt] [libc] [llvm] [PGO][AMDGPU] Add offload profiling with uniformity-aware optimization (PR #177665)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 17:55:50 PDT 2026


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/177665

>From fbded3c0689de5435d6a992e5e44f332976c8cee Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Sat, 28 Mar 2026 12:40:34 -0400
Subject: [PATCH 1/2] [PGO][AMDGPU] Add offload profiling infrastructure for
 HIP

Adds device-side PGO support for HIP/AMDGPU targets:
- GPU profile runtime in compiler-rt (InstrProfilingPlatformROCm.c)
- Driver support for linking GPU profile library (-fprofile-generate)
- Section-based profile collection via offload plugin GlobalHandler
- Contiguous counter allocation with CUID-based deduplication
- Raw profile format v11 with GPU-specific fields
- Lit test updates for v11 format, GPU profile sections, and libc @BINARY@ fix
---
 clang/docs/HIPSupport.rst                     | 102 ++
 compiler-rt/include/profile/InstrProfData.inc |  31 +-
 compiler-rt/lib/profile/CMakeLists.txt        |   1 +
 compiler-rt/lib/profile/InstrProfiling.h      |  52 +-
 .../lib/profile/InstrProfilingBuffer.c        |  41 +-
 compiler-rt/lib/profile/InstrProfilingFile.c  |  73 +-
 .../lib/profile/InstrProfilingInternal.h      |  26 +-
 .../lib/profile/InstrProfilingPlatformGPU.c   |  48 +-
 .../lib/profile/InstrProfilingPlatformROCm.c  | 915 ++++++++++++++++++
 .../lib/profile/InstrProfilingWriter.c        |  38 +-
 .../test/asan/TestCases/log-path_test.cpp     |   6 +-
 .../test/memprof/TestCases/log_path_test.cpp  |  25 +-
 libc/test/lit.site.cfg.py.in                  |   4 +-
 llvm/include/llvm/IR/FixedMetadataKinds.def   |   1 +
 llvm/include/llvm/ProfileData/InstrProf.h     |  43 +-
 .../llvm/ProfileData/InstrProfData.inc        |  31 +-
 .../llvm/ProfileData/InstrProfReader.h        |   5 +
 .../llvm/ProfileData/InstrProfWriter.h        |   1 +
 .../llvm/Transforms/Instrumentation/CFGMST.h  |  26 +-
 llvm/lib/ProfileData/InstrProf.cpp            |  10 +-
 llvm/lib/ProfileData/InstrProfCorrelator.cpp  |   2 +
 llvm/lib/ProfileData/InstrProfReader.cpp      | 113 ++-
 llvm/lib/ProfileData/InstrProfWriter.cpp      |  45 +-
 .../Instrumentation/InstrProfiling.cpp        | 629 +++++++++++-
 .../Instrumentation/PGOInstrumentation.cpp    |  88 +-
 .../AArch64/global-merge-profile-sections.ll  |   4 +-
 .../InstrProfiling/amdgpu-3d-grid.ll          |  24 +
 .../amdgpu-contiguous-counters.ll             |  35 +
 .../InstrProfiling/amdgpu-instrumentation.ll  |  57 ++
 .../InstrProfiling/amdgpu-uniform-counters.ll |  22 +
 .../InstrProfiling/amdgpu-wave32.ll           |  33 +
 .../InstrProfiling/amdgpu-wave64.ll           |  31 +
 .../InstrProfiling/coverage.ll                |   8 +-
 .../InstrProfiling/gpu-weak.ll                |  36 +
 .../InstrProfiling/inline-data-var-create.ll  |  23 +-
 .../InstrProfiling/platform.ll                |  14 +
 .../thinlto_indirect_call_promotion.profraw   | Bin 544 -> 600 bytes
 .../amdgpu-disable-value-profiling.ll         |  23 +
 .../Transforms/PGOProfile/comdat_internal.ll  |   4 +-
 .../instrprof_burst_sampling_fast.ll          |   2 +-
 .../Transforms/PGOProfile/vtable_profile.ll   |   2 +-
 .../llvm-profdata/Inputs/c-general.profraw    | Bin 2032 -> 2152 bytes
 .../llvm-profdata/Inputs/compressed.profraw   | Bin 1984 -> 2104 bytes
 .../llvm-profdata/binary-ids-padding.test     |  19 +-
 .../llvm-profdata/large-binary-id-size.test   |   9 +-
 ...alformed-not-space-for-another-header.test |   7 +-
 .../malformed-num-counters-zero.test          |   8 +-
 .../malformed-ptr-to-counter-array.test       |   8 +-
 .../misaligned-binary-ids-size.test           |   2 +-
 .../mismatched-raw-profile-header.test        |   3 +
 .../tools/llvm-profdata/profile-version.test  |   2 +-
 .../tools/llvm-profdata/raw-32-bits-be.test   |  19 +-
 .../tools/llvm-profdata/raw-32-bits-le.test   |  19 +-
 .../tools/llvm-profdata/raw-64-bits-be.test   |  21 +-
 .../tools/llvm-profdata/raw-64-bits-le.test   |  19 +-
 .../tools/llvm-profdata/raw-two-profiles.test |  12 +-
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  27 +
 .../common/include/GlobalHandler.h            |   8 +-
 .../common/src/GlobalHandler.cpp              |   6 +-
 59 files changed, 2627 insertions(+), 236 deletions(-)
 create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-3d-grid.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll

diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index e7f00202c34e7..6560fa5afdeee 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -412,6 +412,108 @@ Example Usage
    __host__ __device__ int Four(void) __attribute__((weak, alias("_Z6__Fourv")));
    __host__ __device__ float Four(float f) __attribute__((weak, alias("_Z6__Fourf")));
 
+Profile Guided Optimization (PGO)
+=================================
+
+Clang supports Profile Guided Optimization (PGO) for HIP, enabling optimization
+of both host and device code based on runtime execution profiles.
+
+Build Requirements
+------------------
+
+Device PGO requires ``libclang_rt.profile.a`` built for the AMDGPU target. This
+library provides the GPU profiling runtime (warp-aggregate counter increment with
+uniformity tracking). Building it requires LLVM libc for AMDGPU to provide
+standard C headers:
+
+.. code-block:: text
+
+   # Add to your LLVM cmake configuration:
+   -DLLVM_RUNTIME_TARGETS="default;amdgcn-amd-amdhsa"
+   -DRUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES="<src>/compiler-rt/cmake/caches/GPU.cmake"
+   -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES="compiler-rt;libc"
+
+The ``pyyaml`` Python package is also required (for LLVM libc header generation).
+The resulting library is installed at
+``<resource-dir>/lib/amdgcn-amd-amdhsa/libclang_rt.profile.a`` and linked
+automatically when ``-fprofile-generate`` is used.
+
+Workflow
+--------
+
+The PGO workflow consists of four steps:
+
+1. **Instrumented Build**: Compile with ``-fprofile-generate`` to create an
+   instrumented binary that collects execution profiles:
+
+   .. code-block:: shell
+
+      clang++ -O2 -fprofile-generate --offload-arch=gfx1200 -xhip app.hip -o app_instrumented
+
+2. **Profile Collection**: Run the instrumented binary with representative
+   workloads. This generates separate profile files for host and each GPU
+   architecture:
+
+   .. code-block:: shell
+
+      ./app_instrumented
+      # Creates: default_<id>.profraw (host)
+      #          default_<id>.<arch>.<tu>.profraw (device, e.g. default_12345.gfx1200.0.profraw)
+
+3. **Merge Profiles**: Use ``llvm-profdata`` to merge the raw profiles:
+
+   .. code-block:: shell
+
+      # Merge host profiles
+      llvm-profdata merge -o app.profdata default_*_0.profraw
+
+      # Merge device profiles (use the GPU arch name from your target)
+      llvm-profdata merge -o app.device.profdata \
+          default_*.gfx1200.*.profraw
+
+4. **Optimized Build**: Rebuild with ``-fprofile-use``, specifying separate
+   profile files for host and device using ``-Xarch_host`` and ``-Xarch_device``:
+
+   .. code-block:: shell
+
+      clang++ -O2 --offload-arch=gfx1200 -xhip app.hip -o app_optimized \
+          -Xarch_host -fprofile-use=app.profdata \
+          -Xarch_device -fprofile-use=app.device.profdata
+
+Sampling
+--------
+
+By default, device instrumentation samples 12.5% of thread blocks to reduce
+overhead (``-mllvm -offload-pgo-sampling=3``). This can be tuned:
+
+- ``-mllvm -offload-pgo-sampling=0`` — instrument all blocks (highest accuracy, ~3.4x overhead)
+- ``-mllvm -offload-pgo-sampling=3`` — instrument 12.5% of blocks (default, ~1.9x overhead)
+- ``-mllvm -offload-pgo-sampling=7`` — instrument ~0.8% of blocks (lowest overhead, ~1.7x)
+
+Higher sampling values reduce instrumentation overhead at the cost of sparser
+profiles. For kernels with uniform control flow (e.g., GEMM), even very sparse
+profiles produce the same PGO gains.
+
+Debug Output
+------------
+
+Set ``LLVM_PROFILE_VERBOSE=1`` to see diagnostic messages during profile
+collection:
+
+.. code-block:: shell
+
+   LLVM_PROFILE_VERBOSE=1 ./app_instrumented
+
+This shows information about profile data registration, device memory
+operations, and profile file creation.
+
+Limitations
+-----------
+
+- Device PGO is supported only on AMD GPUs with HIP.
+- Value profiling is not supported for device code.
+- Building the GPU profile library requires LLVM libc for AMDGPU (see Build Requirements).
+
 C++17 Class Template Argument Deduction (CTAD) Support
 ======================================================
 
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 7525feab8f133..c1aa4595b8736 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -78,6 +78,8 @@ INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
                 ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
                 Inc->getHash()->getZExtValue()))
 INSTR_PROF_DATA(const IntPtrT, IntPtrTy, CounterPtr, RelativeCounterPtr)
+INSTR_PROF_DATA(const IntPtrT, IntPtrTy, UniformCounterPtr, \
+                RelativeUniformCounterPtr)
 INSTR_PROF_DATA(const IntPtrT, IntPtrTy, BitmapPtr, RelativeBitmapPtr)
 /* This is used to map function pointers for the indirect call targets to
  * function name hashes during the conversion from raw to merged profile
@@ -89,9 +91,13 @@ INSTR_PROF_DATA(IntPtrT, llvm::PointerType::getUnqual(Ctx), Values, \
                 ValuesPtrExpr)
 INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
-INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \
-                ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
+INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last + 1],
+                ConstantArray::get(Int16ArrayTy, Int16ArrayVals))
+INSTR_PROF_DATA(const uint16_t, llvm::Type::getInt16Ty(Ctx),
+                OffloadDeviceWaveSize,
+                ConstantInt::get(llvm::Type::getInt16Ty(Ctx),
+                                 OffloadDeviceWaveSizeVal))
+INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes,
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes))
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
@@ -168,6 +174,12 @@ INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
 INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
                     Ctx), DataStop,                                            \
                     ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
+INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx),                 \
+                    UniformCountersStart,                                       \
+                    ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
+INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx),                 \
+                    UniformCountersStop,                                        \
+                    ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
 INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx),             \
                     VersionVar,                                                \
                     ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
@@ -192,6 +204,10 @@ INSTR_PROF_RAW_HEADER(uint64_t, NumCounters, NumCounters)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterCounters, PaddingBytesAfterCounters)
 INSTR_PROF_RAW_HEADER(uint64_t, NumBitmapBytes, NumBitmapBytes)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterBitmapBytes, PaddingBytesAfterBitmapBytes)
+INSTR_PROF_RAW_HEADER(uint64_t, NumUniformCounters, NumUniformCounters)
+INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterUniformCounters, PaddingBytesAfterUniformCounters)
+INSTR_PROF_RAW_HEADER(uint64_t, UniformCountersDelta,
+    UniformCountersBegin ? (uintptr_t)UniformCountersBegin - (uintptr_t)DataBegin : 0)
 INSTR_PROF_RAW_HEADER(uint64_t, NamesSize,  NamesSize)
 INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
                       (uintptr_t)CountersBegin - (uintptr_t)DataBegin)
@@ -356,6 +372,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_ucnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_UCNTS_COMMON), \
+                      INSTR_PROF_UCNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -752,9 +771,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
 /* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 10
+#define INSTR_PROF_RAW_VERSION 11
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 13
+#define INSTR_PROF_INDEX_VERSION 14
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
 
@@ -803,6 +822,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_UCNTS_COMMON __llvm_prf_ucnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
@@ -820,6 +840,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COFF ".lprfn$M"
 #define INSTR_PROF_VNAME_COFF ".lprfvn$M"
 #define INSTR_PROF_CNTS_COFF ".lprfc$M"
+#define INSTR_PROF_UCNTS_COFF ".lprfuc$M"
 #define INSTR_PROF_BITS_COFF ".lprfb$M"
 #define INSTR_PROF_VALS_COFF ".lprfv$M"
 #define INSTR_PROF_VNODES_COFF ".lprfnd$M"
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 8c196d15841a4..4a7bccde4b685 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -89,6 +89,7 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL)
   list(APPEND PROFILE_SOURCES
     GCDAProfiling.c
     InstrProfilingFile.c
+    InstrProfilingPlatformROCm.c
     InstrProfilingRuntime.cpp
     InstrProfilingUtil.c
     InstrProfilingValue.c
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 1d22934bd6ef1..ab9b64779ef87 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -308,10 +308,12 @@ uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin,
  */
 int __llvm_profile_get_padding_sizes_for_counters(
     uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
-    uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
-    uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
-    uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames,
-    uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames);
+    uint64_t NumUniformCounters, uint64_t NamesSize, uint64_t VTableSize,
+    uint64_t VNameSize, uint64_t *PaddingBytesBeforeCounters,
+    uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap,
+    uint64_t *PaddingBytesAfterUniformCounters,
+    uint64_t *PaddingBytesAfterNames, uint64_t *PaddingBytesAfterVTable,
+    uint64_t *PaddingBytesAfterVNames);
 
 /*!
  * \brief Set the flag that profile data has been dumped to the file.
@@ -323,14 +325,28 @@ void __llvm_profile_set_dumped(void);
 
 /*!
  * \brief Write custom target-specific profiling data to a separate file.
- * Used by offload PGO.
+ * Used by offload PGO (HIP and OpenMP).
+ *
+ * \param Target Target triple (e.g., "amdgcn-amd-amdhsa")
+ * \param TUSuffix TU index suffix (e.g., "0", "1") or NULL for no suffix
+ * \param DataBegin Start of profile data records
+ * \param DataEnd End of profile data records
+ * \param CountersBegin Start of counter data
+ * \param CountersEnd End of counter data
+ * \param UniformCountersBegin Start of uniform counters (NULL if not used)
+ * \param UniformCountersEnd End of uniform counters (NULL if not used)
+ * \param NamesBegin Start of names data
+ * \param NamesEnd End of names data
+ * \param VersionOverride Profile version override (NULL to use default)
  */
-int __llvm_write_custom_profile(const char *Target,
+int __llvm_write_custom_profile(const char *Target, const char *TUSuffix,
                                 const __llvm_profile_data *DataBegin,
                                 const __llvm_profile_data *DataEnd,
                                 const char *CountersBegin,
-                                const char *CountersEnd, const char *NamesBegin,
-                                const char *NamesEnd,
+                                const char *CountersEnd,
+                                const char *UniformCountersBegin,
+                                const char *UniformCountersEnd,
+                                const char *NamesBegin, const char *NamesEnd,
                                 const uint64_t *VersionOverride);
 
 /*!
@@ -364,4 +380,24 @@ extern char INSTR_PROF_PROFILE_NAME_VAR[1]; /* __llvm_profile_filename. */
 
 const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit();
 const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit();
+
+/* A struct to hold the device pointers and sizes for the profile sections. */
+typedef struct OffloadProfileSectionInfo {
+  void *CountersBegin;
+  size_t CountersSize;
+  void *DataBegin;
+  size_t DataSize;
+  void *NamesBegin;
+  size_t NamesSize;
+} OffloadProfileSectionInfo;
+
+/*!
+ * \brief Register an offload module's device-side profile data sections.
+ *
+ * This function is called by the host-side instrumentation code to provide
+ * the runtime with the necessary information to collect profile data from
+ * the device.
+ */
+void __llvm_profile_offload_register_module(OffloadProfileSectionInfo *Info);
+
 #endif /* PROFILE_INSTRPROFILING_H_ */
diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c
index b406e8db74f3f..abaf2a5fbfee0 100644
--- a/compiler-rt/lib/profile/InstrProfilingBuffer.c
+++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c
@@ -147,17 +147,21 @@ static int needsCounterPadding(void) {
 COMPILER_RT_VISIBILITY
 int __llvm_profile_get_padding_sizes_for_counters(
     uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
-    uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
-    uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
-    uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames,
-    uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) {
-  // Counter padding is needed only if continuous mode is enabled.
+    uint64_t NumUniformCounters, uint64_t NamesSize, uint64_t VTableSize,
+    uint64_t VNameSize, uint64_t *PaddingBytesBeforeCounters,
+    uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes,
+    uint64_t *PaddingBytesAfterUniformCounters,
+    uint64_t *PaddingBytesAfterNames, uint64_t *PaddingBytesAfterVTable,
+    uint64_t *PaddingBytesAfterVName) {
   if (!needsCounterPadding()) {
     *PaddingBytesBeforeCounters = 0;
     *PaddingBytesAfterCounters =
         __llvm_profile_get_num_padding_bytes(CountersSize);
     *PaddingBytesAfterBitmapBytes =
         __llvm_profile_get_num_padding_bytes(NumBitmapBytes);
+    if (PaddingBytesAfterUniformCounters != NULL)
+      *PaddingBytesAfterUniformCounters = __llvm_profile_get_num_padding_bytes(
+          NumUniformCounters * sizeof(uint64_t));
     *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize);
     if (PaddingBytesAfterVTable != NULL)
       *PaddingBytesAfterVTable =
@@ -167,21 +171,17 @@ int __llvm_profile_get_padding_sizes_for_counters(
     return 0;
   }
 
-  // Value profiling not supported in continuous mode at profile-write time.
-  // Return -1 to alert the incompatibility.
   if (VTableSize != 0 || VNameSize != 0)
     return -1;
 
-  // In continuous mode, the file offsets for headers and for the start of
-  // counter sections need to be page-aligned.
   *PaddingBytesBeforeCounters =
       calculateBytesNeededToPageAlign(sizeof(__llvm_profile_header) + DataSize);
   *PaddingBytesAfterCounters = calculateBytesNeededToPageAlign(CountersSize);
   *PaddingBytesAfterBitmapBytes =
       calculateBytesNeededToPageAlign(NumBitmapBytes);
+  if (PaddingBytesAfterUniformCounters != NULL)
+    *PaddingBytesAfterUniformCounters = 0;
   *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize);
-  // Set these two variables to zero to avoid uninitialized variables
-  // even if VTableSize and VNameSize are known to be zero.
   if (PaddingBytesAfterVTable != NULL)
     *PaddingBytesAfterVTable = 0;
   if (PaddingBytesAfterVName != NULL)
@@ -212,20 +212,22 @@ uint64_t __llvm_profile_get_size_for_buffer_internal(
    * the names. */
   uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
       PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes,
-      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+      PaddingBytesAfterUniformCounters, PaddingBytesAfterVTable,
+      PaddingBytesAfterVNames;
   __llvm_profile_get_padding_sizes_for_counters(
-      DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */,
-      0 /* VNameSize */, &PaddingBytesBeforeCounters,
-      &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+      DataSize, CountersSize, NumBitmapBytes, 0 /* NumUniformCounters */,
+      NamesSize, 0 /* VTableSize */, 0 /* VNameSize */,
+      &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
+      &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterUniformCounters,
       &PaddingBytesAfterNames, &PaddingBytesAfterVTable,
       &PaddingBytesAfterVNames);
 
   return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) +
          DataSize + PaddingBytesBeforeCounters + CountersSize +
          PaddingBytesAfterCounters + NumBitmapBytes +
-         PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames +
-         VTableSize + PaddingBytesAfterVTable + VNameSize +
-         PaddingBytesAfterVNames;
+         PaddingBytesAfterBitmapBytes + PaddingBytesAfterUniformCounters +
+         NamesSize + PaddingBytesAfterNames + VTableSize +
+         PaddingBytesAfterVTable + VNameSize + PaddingBytesAfterVNames;
 }
 
 COMPILER_RT_VISIBILITY
@@ -250,7 +252,8 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal(
   // Set virtual table arguments to NULL since they are not supported yet.
   return lprofWriteDataImpl(
       &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd,
-      BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd,
+      BitmapBegin, BitmapEnd, /*UniformCountersBegin=*/NULL,
+      /*UniformCountersEnd=*/NULL, /*VPDataReader=*/0, NamesBegin, NamesEnd,
       /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL,
       /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0,
       __llvm_profile_get_version());
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 71127b05aafb8..bbed1f540c8ac 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -151,11 +151,13 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
    * after the names. */
   uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
       PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes,
-      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+      PaddingBytesAfterUniformCounters, PaddingBytesAfterVTable,
+      PaddingBytesAfterVNames;
   __llvm_profile_get_padding_sizes_for_counters(
-      DataSize, CountersSize, NumBitmapBytes, NamesSize, /*VTableSize=*/0,
-      /*VNameSize=*/0, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
-      &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames,
+      DataSize, CountersSize, NumBitmapBytes, 0 /* NumUniformCounters */,
+      NamesSize, /*VTableSize=*/0, /*VNameSize=*/0, &PaddingBytesBeforeCounters,
+      &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+      &PaddingBytesAfterUniformCounters, &PaddingBytesAfterNames,
       &PaddingBytesAfterVTable, &PaddingBytesAfterVNames);
 
   uint64_t PageAlignedCountersLength = CountersSize + PaddingBytesAfterCounters;
@@ -1198,6 +1200,8 @@ int __llvm_profile_write_file(void) {
   if (rc)
     PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno));
 
+  __llvm_profile_hip_collect_device_data();
+
   // Restore SIGKILL.
   if (PDeathSig == 1)
     lprofRestoreSigKill();
@@ -1282,14 +1286,16 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
 }
 
 #ifndef __APPLE__
-int __llvm_write_custom_profile(const char *Target,
+int __llvm_write_custom_profile(const char *Target, const char *TUSuffix,
                                 const __llvm_profile_data *DataBegin,
                                 const __llvm_profile_data *DataEnd,
                                 const char *CountersBegin,
-                                const char *CountersEnd, const char *NamesBegin,
-                                const char *NamesEnd,
+                                const char *CountersEnd,
+                                const char *UniformCountersBegin,
+                                const char *UniformCountersEnd,
+                                const char *NamesBegin, const char *NamesEnd,
                                 const uint64_t *VersionOverride) {
-  int ReturnValue = 0, FilenameLength, TargetLength;
+  int ReturnValue = 0, FilenameLength, TargetLength, TUSuffixLength;
   char *FilenameBuf, *TargetFilename;
   const char *Filename;
 
@@ -1307,7 +1313,8 @@ int __llvm_write_custom_profile(const char *Target,
   }
 
   /* Check if there is llvm/runtime version mismatch.  */
-  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
+  if (VersionOverride == NULL &&
+      GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
     PROF_ERR("Runtime and instrumentation version mismatch : "
              "expected %d, but get %d\n",
              INSTR_PROF_RAW_VERSION,
@@ -1331,9 +1338,12 @@ int __llvm_write_custom_profile(const char *Target,
   }
 
   /* Allocate new space for our target-specific PGO filename */
+  /* Format: <dir>/<basename_without_ext>.<target>.<TUSuffix>.<ext> */
+  /* This matches the HIP convention for backward compatibility */
   TargetLength = strlen(Target);
-  TargetFilename =
-      (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
+  TUSuffixLength = TUSuffix ? strlen(TUSuffix) : 0;
+  TargetFilename = (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength +
+                                              TUSuffixLength + 3);
 
   /* Find file basename and path sizes */
   int32_t DirEnd = FilenameLength - 1;
@@ -1342,15 +1352,34 @@ int __llvm_write_custom_profile(const char *Target,
   }
   uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize;
 
-  /* Prepend "TARGET." to current filename */
+  /* Find extension within basename */
+  const char *Basename = Filename + DirSize;
+  const char *Extension = strrchr(Basename, '.');
+  uint32_t BasenameNoExtSize =
+      Extension ? (uint32_t)(Extension - Basename) : BaseSize;
+  uint32_t ExtSize = Extension ? (uint32_t)(BaseSize - BasenameNoExtSize) : 0;
+
+  /* Build filename: <dir>/<basename_without_ext>.<target>.<TUSuffix>.<ext> */
+  char *p = TargetFilename;
   if (DirSize > 0) {
-    memcpy(TargetFilename, Filename, DirSize);
+    memcpy(p, Filename, DirSize);
+    p += DirSize;
+  }
+  memcpy(p, Basename, BasenameNoExtSize);
+  p += BasenameNoExtSize;
+  *p++ = '.';
+  memcpy(p, Target, TargetLength);
+  p += TargetLength;
+  if (TUSuffixLength > 0) {
+    *p++ = '.';
+    memcpy(p, TUSuffix, TUSuffixLength);
+    p += TUSuffixLength;
+  }
+  if (ExtSize > 0) {
+    memcpy(p, Extension, ExtSize);
+    p += ExtSize;
   }
-  memcpy(TargetFilename + DirSize, Target, TargetLength);
-  TargetFilename[TargetLength + DirSize] = '.';
-  memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
-         BaseSize);
-  TargetFilename[FilenameLength + 1 + TargetLength] = 0;
+  *p = '\0';
 
   /* Open and truncate target-specific PGO file */
   FILE *OutputFile = fopen(TargetFilename, "w");
@@ -1375,10 +1404,10 @@ int __llvm_write_custom_profile(const char *Target,
     Version = *VersionOverride;
 
   /* Write custom data to the file */
-  ReturnValue =
-      lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin,
-                         CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL,
-                         NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version);
+  ReturnValue = lprofWriteDataImpl(
+      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
+      UniformCountersBegin, UniformCountersEnd, lprofGetVPDataReader(), NULL,
+      NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version);
   closeFileObject(OutputFile);
 
   // Restore SIGKILL.
diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h
index 5647782527eb7..42c4580c56bd9 100644
--- a/compiler-rt/lib/profile/InstrProfilingInternal.h
+++ b/compiler-rt/lib/profile/InstrProfilingInternal.h
@@ -152,16 +152,15 @@ typedef struct VPDataReaderType {
    the name data is already in destination, we just skip over it. */
 int lprofWriteData(ProfDataWriter *Writer, VPDataReaderType *VPDataReader,
                    int SkipNameDataWrite);
-int lprofWriteDataImpl(ProfDataWriter *Writer,
-                       const __llvm_profile_data *DataBegin,
-                       const __llvm_profile_data *DataEnd,
-                       const char *CountersBegin, const char *CountersEnd,
-                       const char *BitmapBegin, const char *BitmapEnd,
-                       VPDataReaderType *VPDataReader, const char *NamesBegin,
-                       const char *NamesEnd, const VTableProfData *VTableBegin,
-                       const VTableProfData *VTableEnd, const char *VNamesBegin,
-                       const char *VNamesEnd, int SkipNameDataWrite,
-                       uint64_t Version);
+int lprofWriteDataImpl(
+    ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
+    const __llvm_profile_data *DataEnd, const char *CountersBegin,
+    const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd,
+    const char *UniformCountersBegin, const char *UniformCountersEnd,
+    VPDataReaderType *VPDataReader, const char *NamesBegin,
+    const char *NamesEnd, const VTableProfData *VTableBegin,
+    const VTableProfData *VTableEnd, const char *VNamesBegin,
+    const char *VNamesEnd, int SkipNameDataWrite, uint64_t Version);
 
 /* Merge value profile data pointed to by SrcValueProfData into
  * in-memory profile counters pointed by to DstData.  */
@@ -212,5 +211,12 @@ int __llvm_write_binary_ids(ProfDataWriter *Writer);
 int lprofWriteOneBinaryId(ProfDataWriter *Writer, uint64_t BinaryIdLen,
                           const uint8_t *BinaryIdData,
                           uint64_t BinaryIdPadding);
+#ifdef __cplusplus
+extern "C" {
+#endif
+COMPILER_RT_VISIBILITY int __llvm_profile_hip_collect_device_data(void);
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
index ab7031343c855..e6a1395042107 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
@@ -16,6 +16,7 @@
 
 #include "InstrProfiling.h"
 #include <gpuintrin.h>
+#include <stdint.h>
 
 // Symbols exported to the GPU runtime need to be visible in the .dynsym table.
 #define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
@@ -42,7 +43,30 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
   }
 }
 
+// Block-level sampling for offload PGO. For GPU kernels with stationary
+// behavior (where all thread blocks execute the same code paths regardless of
+// block ID), partial sampling significantly reduces instrumentation overhead
+// without losing PGO performance gains.
+//
+// Returns 1 if this block should be instrumented, 0 to skip. Samples by
+// matching lower bits of the linearized 3D block ID to zero.
+//   sampling_bits=0: all blocks (100%); sampling_bits=3: every 8th (12.5%)
+//   sampling_bits>=32: clamped to a full mask (only block 0 instrumented)
+COMPILER_RT_VISIBILITY int __llvm_profile_sampling_gpu(uint32_t sampling_bits) {
+  if (sampling_bits == 0)
+    return 1;
+
+  uint32_t gdx = __gpu_num_blocks_x();
+  uint32_t gdy = __gpu_num_blocks_y();
+  uint32_t block_id = __gpu_block_id_x() + __gpu_block_id_y() * gdx +
+                      __gpu_block_id_z() * gdx * gdy;
+
+  uint32_t mask = sampling_bits >= 32 ? ~0u : (1u << sampling_bits) - 1;
+  return (block_id & mask) == 0;
+}
+
 #if defined(__AMDGPU__)
+__attribute__((weak)) const int __oclc_ABI_version = 600;
 
 #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
 #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
@@ -50,6 +74,8 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
 #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
 #define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
 #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
+#define PROF_UCNTS_START INSTR_PROF_SECT_START(INSTR_PROF_UCNTS_COMMON)
+#define PROF_UCNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_UCNTS_COMMON)
 
 extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
@@ -59,28 +85,22 @@ extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
     COMPILER_RT_WEAK;
 extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
     COMPILER_RT_WEAK;
+extern char PROF_UCNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_UCNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 
-// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
 COMPILER_RT_GPU_VISIBILITY
 __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
-    PROF_NAME_START,
-    PROF_NAME_STOP,
-    PROF_CNTS_START,
-    PROF_CNTS_STOP,
-    PROF_DATA_START,
-    PROF_DATA_STOP,
-    &INSTR_PROF_RAW_VERSION_VAR};
+    PROF_NAME_START,  PROF_NAME_STOP,  PROF_CNTS_START,
+    PROF_CNTS_STOP,   PROF_DATA_START, PROF_DATA_STOP,
+    PROF_UCNTS_START, PROF_UCNTS_STOP, &INSTR_PROF_RAW_VERSION_VAR};
 
 #elif defined(__NVPTX__)
 
-// NVPTX supports neither sections nor ELF symbols, we rely on the handling in
-// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
-// FIXME: This will not work until we make the NVPTX backend emit section
-//        globals next to each other.
 COMPILER_RT_GPU_VISIBILITY
 __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
-    NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
-
+    NULL, NULL, NULL,
+    NULL, NULL, NULL,
+    NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
 #endif
 
 #endif
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
new file mode 100644
index 0000000000000..fed0949345c7e
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -0,0 +1,915 @@
+//===- InstrProfilingPlatformROCm.c - Profile data ROCm platform ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingPort.h"
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* -------------------------------------------------------------------------- */
+/*  Platform abstraction for dynamic library loading                          */
+/* -------------------------------------------------------------------------- */
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+typedef HMODULE DylibHandle;
+
+static DylibHandle DylibOpen(const char *Name) { return LoadLibraryA(Name); }
+
+static void *DylibSym(DylibHandle H, const char *Sym) {
+  return (void *)(uintptr_t)GetProcAddress(H, Sym);
+}
+
+static int DylibAvailable(void) { return 1; }
+
+#else /* POSIX */
+#include <dlfcn.h>
+
+/* Use weak references for dl* functions to avoid requiring -ldl at link time.
+ *
+ * The profile runtime is a static library, so its dependencies must be
+ * explicitly linked by the user. Unlike sanitizer runtimes (which are often
+ * shared libraries with their own dependencies), adding -ldl globally would
+ * affect all profiling users, including those not using HIP/ROCm.
+ *
+ * With weak references:
+ * - Programs without -ldl link successfully (dl* resolve to NULL)
+ * - HIP programs get -ldl from the HIP runtime, so dl* work normally
+ * - OpenMP offload programs without HIP gracefully skip device profiling
+ */
+#pragma weak dlopen
+#pragma weak dlsym
+#pragma weak dlerror
+
+typedef void *DylibHandle;
+
+static DylibHandle DylibOpen(const char *Name) {
+  return dlopen(Name, RTLD_LAZY | RTLD_LOCAL);
+}
+
+static void *DylibSym(DylibHandle H, const char *Sym) { return dlsym(H, Sym); }
+
+static int DylibAvailable(void) { return dlopen != NULL; }
+
+#endif /* _WIN32 */
+
+static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
+                                   const char *Target);
+
+static int IsVerboseMode(void) { /* caches LLVM_PROFILE_VERBOSE lookup */
+  static int IsVerbose = -1;
+  if (IsVerbose == -1)
+    IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != NULL;
+  return IsVerbose;
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Dynamic loading of HIP runtime symbols                                   */
+/* -------------------------------------------------------------------------- */
+
+typedef int (*hipGetSymbolAddressTy)(void **, const void *);
+typedef int (*hipMemcpyTy)(void *, const void *, size_t, int);
+typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
+typedef int (*hipGetDeviceCountTy)(int *);
+typedef int (*hipGetDeviceTy)(int *);
+typedef int (*hipSetDeviceTy)(int);
+
+/* hipDeviceProp_t layout for HIP 6.x+ (R0600): gcnArchName sits at offset
+ * 1160 in that ABI (re-verify on HIP major-version updates). Pad to 4096 to
+ * safely accommodate future struct growth without recompilation. */
+typedef struct {
+  char padding[1160];
+  char gcnArchName[256];
+  char tail_padding[2680];
+} HipDevicePropMinimal;
+typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int);
+
+static hipGetSymbolAddressTy pHipGetSymbolAddress = NULL;
+static hipMemcpyTy pHipMemcpy = NULL;
+static hipModuleGetGlobalTy pHipModuleGetGlobal = NULL;
+static hipGetDeviceCountTy pHipGetDeviceCount = NULL;
+static hipGetDeviceTy pHipGetDevice = NULL;
+static hipSetDeviceTy pHipSetDevice = NULL;
+static hipGetDevicePropertiesTy pHipGetDeviceProperties = NULL;
+
+#define MAX_DEVICES 16
+static int NumDevices = 0;
+static char DeviceArchNames[MAX_DEVICES][256];
+
+/* -------------------------------------------------------------------------- */
+/*  Device-to-host copies                                                     */
+/*  Keep HIP-only to avoid an HSA dependency.                                 */
+/* -------------------------------------------------------------------------- */
+
+static void EnsureHipLoaded(void) {
+  static int Initialized = 0;
+  if (Initialized)
+    return;
+  Initialized = 1;
+
+  if (!DylibAvailable()) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s", "Dynamic library loading not available - "
+                      "HIP profiling disabled\n");
+    return;
+  }
+
+#ifdef _WIN32
+  static const char HipLibName[] = "amdhip64.dll";
+#else
+  static const char HipLibName[] = "libamdhip64.so";
+#endif
+
+  DylibHandle Handle = DylibOpen(HipLibName);
+  if (!Handle) {
+#ifndef _WIN32
+    if (dlerror)
+      fprintf(stderr, "compiler-rt: failed to open %s: %s\n", HipLibName,
+              dlerror());
+#endif
+    return;
+  }
+
+  pHipGetSymbolAddress =
+      (hipGetSymbolAddressTy)DylibSym(Handle, "hipGetSymbolAddress");
+  pHipMemcpy = (hipMemcpyTy)DylibSym(Handle, "hipMemcpy");
+  pHipModuleGetGlobal =
+      (hipModuleGetGlobalTy)DylibSym(Handle, "hipModuleGetGlobal");
+  pHipGetDeviceCount =
+      (hipGetDeviceCountTy)DylibSym(Handle, "hipGetDeviceCount");
+  pHipGetDevice = (hipGetDeviceTy)DylibSym(Handle, "hipGetDevice");
+  pHipSetDevice = (hipSetDeviceTy)DylibSym(Handle, "hipSetDevice");
+  pHipGetDeviceProperties =
+      (hipGetDevicePropertiesTy)DylibSym(Handle, "hipGetDevicePropertiesR0600");
+  if (!pHipGetDeviceProperties)
+    pHipGetDeviceProperties =
+        (hipGetDevicePropertiesTy)DylibSym(Handle, "hipGetDeviceProperties");
+
+  if (pHipGetDeviceCount && pHipGetDeviceProperties) {
+    int Count = 0;
+    if (pHipGetDeviceCount(&Count) == 0) {
+      if (Count > MAX_DEVICES)
+        Count = MAX_DEVICES;
+      HipDevicePropMinimal Prop;
+      for (int i = 0; i < Count; ++i) {
+        memset(&Prop, 0, sizeof(Prop));
+        if (pHipGetDeviceProperties(&Prop, i) == 0) {
+          strncpy(DeviceArchNames[i], Prop.gcnArchName,
+                  sizeof(DeviceArchNames[i]) - 1);
+          DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0';
+          if (IsVerboseMode())
+            PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]);
+        }
+      }
+      NumDevices = Count;
+    }
+  }
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Internal wrappers that forward to the loaded HIP symbols                 */
+/* -------------------------------------------------------------------------- */
+
+static int hipGetSymbolAddress(void **devPtr, const void *symbol) {
+  EnsureHipLoaded();
+  return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
+}
+
+static int hipMemcpy(void *dest, const void *src, size_t len,
+                     int kind /*2=DToH*/) {
+  EnsureHipLoaded();
+  return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1;
+}
+
+/* Copy from device to host using HIP.
+ * This requires that the device section symbols are registered with CLR,
+ * otherwise hipMemcpy may attempt a CPU path and crash. */
+static int memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) {
+  return hipMemcpy(Dst, Src, Size, 2 /* DToH */);
+}
+
+static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module,
+                              const char *Name) {
+  EnsureHipLoaded();
+  return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name)
+                             : -1;
+}
+
+static int hipGetDevice(int *DeviceId) {
+  EnsureHipLoaded();
+  return pHipGetDevice ? pHipGetDevice(DeviceId) : -1;
+}
+
+static int hipSetDevice(int DeviceId) {
+  EnsureHipLoaded();
+  return pHipSetDevice ? pHipSetDevice(DeviceId) : -1;
+}
+
+static const char *getDeviceArchName(int DeviceId) {
+  if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0])
+    return "amdgpu";
+  return DeviceArchNames[DeviceId];
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Dynamic module tracking                                                   */
+/* -------------------------------------------------------------------------- */
+
+/* Per-TU profile entry inside a dynamic module.
+ * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */
+typedef struct {
+  void *DeviceVar; /* device address of __llvm_offload_prf_<CUID>      */
+  int Processed;   /* 0 = not yet collected, 1 = data already copied   */
+} OffloadDynamicTUInfo;
+
+/* One entry per hipModuleLoad call. */
+typedef struct {
+  void *ModulePtr;           /* hipModule_t handle                        */
+  OffloadDynamicTUInfo *TUs; /* array of per-TU entries                 */
+  int NumTUs;
+  int CapTUs;
+} OffloadDynamicModuleInfo;
+
+static OffloadDynamicModuleInfo *DynamicModules = NULL;
+static int NumDynamicModules = 0;
+static int CapDynamicModules = 0;
+
+/* -------------------------------------------------------------------------- */
+/*  ELF symbol enumeration (Linux only)                                       */
+/*                                                                            */
+/*  AMDGPU code objects are always ELF, but <elf.h> is a Linux system header. */
+/*  Dynamic module profiling (hipModuleLoadData) is currently Linux-only. */
+/* -------------------------------------------------------------------------- */
+
+#if defined(__linux__)
+#include <elf.h>
+
+/* Callback invoked for every matching symbol name found in the ELF image.
+ * Return 0 to continue iteration, non-zero to stop. */
+typedef int (*SymbolCallback)(const char *Name, void *UserData);
+
+/* If Image is a clang offload bundle (__CLANG_OFFLOAD_BUNDLE__), find the
+ * first embedded code object that is a valid ELF and return a pointer to it.
+ * Otherwise return Image unchanged. Returns NULL if no ELF is found. */
+static const void *UnwrapOffloadBundle(const void *Image) {
+  static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+  if (memcmp(Image, BundleMagic, sizeof(BundleMagic) - 1) != 0)
+    return Image; /* Not a bundle, return as-is. */
+
+  const char *Buf = (const char *)Image;
+  uint64_t NumEntries;
+  memcpy(&NumEntries, Buf + 24, sizeof(uint64_t));
+
+  /* Walk the entry table (starts at offset 32). */
+  const char *Cursor = Buf + 32;
+  for (uint64_t I = 0; I < NumEntries; ++I) {
+    uint64_t EntryOffset, EntrySize, IDSize;
+    memcpy(&EntryOffset, Cursor, 8);
+    Cursor += 8;
+    memcpy(&EntrySize, Cursor, 8);
+    Cursor += 8;
+    memcpy(&IDSize, Cursor, 8);
+    Cursor += 8;
+    /* Skip the entry ID string. */
+    Cursor += IDSize;
+
+    /* Check if this entry contains an ELF. */
+    if (EntrySize >= sizeof(Elf64_Ehdr)) {
+      const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset);
+      if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 &&
+          E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) {
+        return (const void *)(Buf + EntryOffset);
+      }
+    }
+  }
+
+  PROF_WARN("%s", "Offload bundle contains no valid ELF entries\n");
+  return NULL;
+}
+
+/* Parse an AMDGPU code-object ELF and invoke CB for every global symbol whose
+ * name starts with PREFIX.  Image may be NULL (e.g. hipModuleLoad from file)
+ * or a clang offload bundle containing an ELF;
+ * in that case the function unwraps the bundle first. */
+static void EnumerateElfSymbols(const void *Image, const char *Prefix,
+                                SymbolCallback CB, void *UserData) {
+  if (!Image)
+    return;
+
+  /* Handle clang offload bundle wrapping. */
+  Image = UnwrapOffloadBundle(Image);
+  if (!Image)
+    return;
+
+  const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image;
+  if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+      Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n");
+    return;
+  }
+
+  size_t PrefixLen = strlen(Prefix);
+  const char *Base = (const char *)Image;
+  const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff);
+
+  for (int i = 0; i < Ehdr->e_shnum; ++i) {
+    if (Shdrs[i].sh_type != SHT_SYMTAB)
+      continue;
+
+    const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset);
+    uint64_t NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym); /* Elf64_Xword */
+    /* String table is the section referenced by sh_link. */
+    const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset;
+
+    for (uint64_t j = 0; j < NumSyms; ++j) {
+      if (Syms[j].st_name == 0)
+        continue;
+      const char *Name = StrTab + Syms[j].st_name;
+      if (strncmp(Name, Prefix, PrefixLen) == 0) {
+        if (CB(Name, UserData))
+          return;
+      }
+    }
+  }
+}
+
+/* State passed through the enumeration callback. */
+typedef struct {
+  void *Module; /* hipModule_t */
+  OffloadDynamicModuleInfo *ModInfo;
+} EnumState;
+
+/* Grow the TU array inside a module entry and register one __llvm_offload_prf_*
+ * symbol. Also pre-registers the corresponding per-TU section symbols with CLR
+ * (needed so hipMemcpy can copy from those device addresses later). */
+static int RegisterPrfSymbol(const char *Name, void *UserData) {
+  EnumState *S = (EnumState *)UserData;
+  OffloadDynamicModuleInfo *MI = S->ModInfo;
+
+  /* Look up the per-TU pointer variable, then dereference to get the
+   * address of __llvm_profile_sections. */
+  void *DevicePtrVar = NULL;
+  size_t Bytes = 0;
+  if (hipModuleGetGlobal(&DevicePtrVar, &Bytes, S->Module, Name) != 0) {
+    PROF_WARN("Failed to get symbol %s for module %p\n", Name, S->Module);
+    return 0; /* continue */
+  }
+  void *DeviceVar = NULL;
+  if (hipMemcpy(&DeviceVar, DevicePtrVar, sizeof(void *), 2 /*DToH*/) != 0) {
+    PROF_WARN("Failed to read sections pointer for %s\n", Name);
+    return 0;
+  }
+
+  /* Grow TU array if needed. */
+  if (MI->NumTUs >= MI->CapTUs) {
+    int NewCap = MI->CapTUs ? MI->CapTUs * 2 : 4;
+    OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc(
+        MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo));
+    if (!New) {
+      PROF_ERR("%s\n", "Failed to grow TU array");
+      return 0;
+    }
+    MI->TUs = New;
+    MI->CapTUs = NewCap;
+  }
+  OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++];
+  TU->DeviceVar = DeviceVar;
+  TU->Processed = 0;
+
+  (void)Name; /* CUID suffix available for future per-TU section lookup */
+
+  return 0; /* continue enumeration */
+}
+
+#endif /* defined(__linux__) */
+
+/* -------------------------------------------------------------------------- */
+/*  Registration / un-registration helpers                                   */
+/* -------------------------------------------------------------------------- */
+
+void __llvm_profile_offload_register_dynamic_module(int ModuleLoadRc,
+                                                    void **Ptr,
+                                                    const void *Image) {
+  if (IsVerboseMode())
+    PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n",
+              NumDynamicModules, ModuleLoadRc, *Ptr, Image);
+
+  if (ModuleLoadRc)
+    return;
+
+  if (NumDynamicModules >= CapDynamicModules) {
+    int NewCap = CapDynamicModules ? CapDynamicModules * 2 : 64;
+    OffloadDynamicModuleInfo *New = (OffloadDynamicModuleInfo *)realloc(
+        DynamicModules, NewCap * sizeof(OffloadDynamicModuleInfo));
+    if (!New)
+      return;
+    DynamicModules = New;
+    CapDynamicModules = NewCap;
+  }
+
+  OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++];
+  MI->ModulePtr = *Ptr;
+  MI->TUs = NULL;
+  MI->NumTUs = 0;
+  MI->CapTUs = 0;
+
+  /* Enumerate all __llvm_offload_prf_<CUID> symbols in the ELF image.
+   * For each one, look it up via hipModuleGetGlobal (which also registers
+   * the device address with CLR for later hipMemcpy) and store the entry.
+   *
+   * ELF parsing requires <elf.h> which is Linux-only.  On other platforms,
+   * dynamic module profiling is not yet supported. */
+#if defined(__linux__)
+  EnumState State = {*Ptr, MI};
+  EnumerateElfSymbols(Image, "__llvm_offload_prf_", RegisterPrfSymbol, &State);
+#else
+  (void)Image;
+  if (IsVerboseMode())
+    PROF_NOTE("%s",
+              "Dynamic module profiling not supported on this platform\n");
+#endif
+
+  if (MI->NumTUs == 0) {
+    PROF_WARN("No __llvm_offload_prf_* symbols found in module %p\n", *Ptr);
+  } else if (IsVerboseMode()) {
+    PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs);
+  }
+}
+
+void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *MI = &DynamicModules[i];
+
+    if (MI->ModulePtr != Ptr)
+      continue;
+
+    if (IsVerboseMode())
+      PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr,
+                MI->NumTUs);
+
+    /* Process every TU in this module. */
+    for (int t = 0; t < MI->NumTUs; ++t) {
+      OffloadDynamicTUInfo *TU = &MI->TUs[t];
+      if (TU->Processed) {
+        if (IsVerboseMode())
+          PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t);
+        continue;
+      }
+      /* TU index for the output filename; unique only if <1000 TUs/module. */
+      int TUIndex = i * 1000 + t;
+      if (TU->DeviceVar) {
+        int CurDev = 0;
+        hipGetDevice(&CurDev);
+        const char *ArchName = getDeviceArchName(CurDev);
+        if (ProcessDeviceOffloadPrf(TU->DeviceVar, TUIndex, ArchName) == 0)
+          TU->Processed = 1;
+        else
+          PROF_WARN("Failed to process profile data for module %p TU %d\n", Ptr,
+                    t);
+      }
+    }
+    return;
+  }
+
+  if (IsVerboseMode())
+    PROF_WARN("Unregister called for unknown module %p\n", Ptr);
+}
+
+/* Grow a void* array, doubling capacity (or starting at InitCap). */
+static int GrowPtrArray(void ***Arr, int *Num, int *Cap, int InitCap) {
+  if (*Num < *Cap)
+    return 0;
+  int NewCap = *Cap ? *Cap * 2 : InitCap;
+  void **New = (void **)realloc(*Arr, NewCap * sizeof(void *));
+  if (!New)
+    return -1;
+  *Arr = New;
+  *Cap = NewCap;
+  return 0;
+}
+
+static void **OffloadShadowVariables = NULL;
+static int NumShadowVariables = 0;
+static int CapShadowVariables = 0;
+
+void __llvm_profile_offload_register_shadow_variable(void *ptr) {
+  if (GrowPtrArray(&OffloadShadowVariables, &NumShadowVariables,
+                   &CapShadowVariables, 64))
+    return;
+  OffloadShadowVariables[NumShadowVariables++] = ptr;
+}
+
+static void **OffloadSectionShadowVariables = NULL;
+static int NumSectionShadowVariables = 0;
+static int CapSectionShadowVariables = 0;
+
+void __llvm_profile_offload_register_section_shadow_variable(void *ptr) {
+  if (GrowPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables,
+                   &CapSectionShadowVariables, 64))
+    return;
+  OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr;
+}
+
+static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
+                                   const char *Target) {
+  __llvm_profile_gpu_sections HostSections;
+
+  if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections),
+                2 /*DToH*/) != 0) {
+    PROF_ERR("%s\n", "Failed to copy offload prf structure from device");
+    return -1;
+  }
+
+  const void *DevCntsBegin = HostSections.CountersStart;
+  const void *DevDataBegin = HostSections.DataStart;
+  const void *DevNamesBegin = HostSections.NamesStart;
+  const void *DevUniformCntsBegin = HostSections.UniformCountersStart;
+  const void *DevCntsEnd = HostSections.CountersStop;
+  const void *DevDataEnd = HostSections.DataStop;
+  const void *DevNamesEnd = HostSections.NamesStop;
+  const void *DevUniformCntsEnd = HostSections.UniformCountersStop;
+
+  size_t CountersSize = (const char *)DevCntsEnd - (const char *)DevCntsBegin;
+  size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin;
+  size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin;
+  size_t UniformCountersSize =
+      (const char *)DevUniformCntsEnd - (const char *)DevUniformCntsBegin;
+
+  if (IsVerboseMode())
+    PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu "
+              "Names=[%p,%p]=%zu UCnts=[%p,%p]=%zu\n",
+              DevCntsBegin, DevCntsEnd, CountersSize, DevDataBegin, DevDataEnd,
+              DataSize, DevNamesBegin, DevNamesEnd, NamesSize,
+              DevUniformCntsBegin, DevUniformCntsEnd, UniformCountersSize);
+
+  if (CountersSize == 0 || DataSize == 0)
+    return 0;
+
+  int ret = -1;
+  int NamesReused = 0, CntsReused = 0, UCntsReused = 0, DataReused = 0;
+
+  char *HostDataBegin = NULL;
+  char *HostCountersBegin = NULL;
+  char *HostNamesBegin = NULL;
+  char *HostUniformCountersBegin = NULL;
+
+  /* Sections using linker-defined __start_/__stop_ bounds are shared across
+     TU structs in RDC mode. Deduplicate by caching the last copied range. */
+  static const void *CachedDevNamesBegin = NULL;
+  static char *CachedHostNames = NULL;
+  static size_t CachedNamesSize = 0;
+
+  static const void *CachedDevCntsBegin = NULL;
+  static char *CachedHostCnts = NULL;
+  static size_t CachedCntsSize = 0;
+
+  static const void *CachedDevDataBegin = NULL;
+  static char *CachedHostData = NULL;
+  static size_t CachedDataSize = 0;
+
+  static const void *CachedDevUCntsBegin = NULL;
+  static char *CachedHostUCnts = NULL;
+  static size_t CachedUCntsSize = 0;
+
+  if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin &&
+      CountersSize == CachedCntsSize) {
+    HostCountersBegin = CachedHostCnts;
+    CntsReused = 1;
+    if (IsVerboseMode())
+      PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize);
+  } else if (CountersSize > 0) {
+    HostCountersBegin = (char *)malloc(CountersSize);
+  }
+
+  if (DataSize > 0 && DevDataBegin == CachedDevDataBegin &&
+      DataSize == CachedDataSize) {
+    HostDataBegin = CachedHostData;
+    DataReused = 1;
+    if (IsVerboseMode())
+      PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize);
+  } else if (DataSize > 0) {
+    HostDataBegin = (char *)malloc(DataSize);
+  }
+
+  if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin &&
+      NamesSize == CachedNamesSize) {
+    HostNamesBegin = CachedHostNames;
+    NamesReused = 1;
+    if (IsVerboseMode())
+      PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize);
+  } else if (NamesSize > 0) {
+    HostNamesBegin = (char *)malloc(NamesSize);
+  }
+
+  if (UniformCountersSize > 0 && DevUniformCntsBegin == CachedDevUCntsBegin &&
+      UniformCountersSize == CachedUCntsSize) {
+    HostUniformCountersBegin = CachedHostUCnts;
+    UCntsReused = 1;
+    if (IsVerboseMode())
+      PROF_NOTE("Reusing cached ucnts section (%zu bytes)\n",
+                UniformCountersSize);
+  } else if (UniformCountersSize > 0) {
+    HostUniformCountersBegin = (char *)malloc(UniformCountersSize);
+  }
+
+  if ((DataSize > 0 && !HostDataBegin) ||
+      (CountersSize > 0 && !HostCountersBegin) ||
+      (NamesSize > 0 && !HostNamesBegin) ||
+      (UniformCountersSize > 0 && !HostUniformCountersBegin)) {
+    PROF_ERR("%s\n", "Failed to allocate host memory for device sections");
+    goto cleanup;
+  }
+
+  if ((DataSize > 0 && !DataReused &&
+       memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) ||
+      (CountersSize > 0 && !CntsReused &&
+       memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) !=
+           0) ||
+      (NamesSize > 0 && !NamesReused &&
+       memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0) ||
+      (UniformCountersSize > 0 && !UCntsReused &&
+       memcpyDeviceToHost(HostUniformCountersBegin, DevUniformCntsBegin,
+                          UniformCountersSize) != 0)) {
+    PROF_ERR("%s\n", "Failed to copy profile sections from device");
+    goto cleanup;
+  }
+
+  if (!CntsReused && CountersSize > 0) {
+    CachedDevCntsBegin = DevCntsBegin;
+    CachedHostCnts = HostCountersBegin;
+    CachedCntsSize = CountersSize;
+  }
+  if (!DataReused && DataSize > 0) {
+    CachedDevDataBegin = DevDataBegin;
+    CachedHostData = HostDataBegin;
+    CachedDataSize = DataSize;
+  }
+  if (!NamesReused && NamesSize > 0) {
+    CachedDevNamesBegin = DevNamesBegin;
+    CachedHostNames = HostNamesBegin;
+    CachedNamesSize = NamesSize;
+  }
+  if (!UCntsReused && UniformCountersSize > 0) {
+    CachedDevUCntsBegin = DevUniformCntsBegin;
+    CachedHostUCnts = HostUniformCountersBegin;
+    CachedUCntsSize = UniformCountersSize;
+  }
+
+  if (IsVerboseMode())
+    PROF_NOTE("Copied device sections: Counters=%zu, Data=%zu, Names=%zu, "
+              "UniformCounters=%zu\n",
+              CountersSize, DataSize, NamesSize, UniformCountersSize);
+
+  // Arrange buffer as [Data][Padding][Counters][Names] to match the layout
+  // expected by lprofWriteDataImpl (CountersDelta = CountersBegin - DataBegin).
+  const uint64_t NumData = DataSize / sizeof(__llvm_profile_data);
+  const uint64_t NumBitmapBytes = 0;
+  const uint64_t NumUniformCounters = UniformCountersSize / sizeof(uint64_t);
+  const uint64_t VTableSectionSize = 0;
+  const uint64_t VNamesSize = 0;
+  uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
+      PaddingBytesAfterBitmapBytes, PaddingBytesAfterUniformCounters,
+      PaddingBytesAfterNames, PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+
+  if (__llvm_profile_get_padding_sizes_for_counters(
+          DataSize, CountersSize, NumBitmapBytes, NumUniformCounters, NamesSize,
+          VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters,
+          &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+          &PaddingBytesAfterUniformCounters, &PaddingBytesAfterNames,
+          &PaddingBytesAfterVTable, &PaddingBytesAfterVNames) != 0) {
+    PROF_ERR("%s\n", "Failed to get padding sizes");
+    goto cleanup;
+  }
+
+  size_t ContiguousBufferSize =
+      DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
+  char *ContiguousBuffer = (char *)malloc(ContiguousBufferSize);
+  if (!ContiguousBuffer) {
+    PROF_ERR("%s\n", "Failed to allocate contiguous buffer");
+    goto cleanup;
+  }
+  memset(ContiguousBuffer, 0, ContiguousBufferSize);
+
+  char *BufDataBegin = ContiguousBuffer;
+  char *BufCountersBegin =
+      ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
+  char *BufNamesBegin = BufCountersBegin + CountersSize;
+
+  memcpy(BufDataBegin, HostDataBegin, DataSize);
+  memcpy(BufCountersBegin, HostCountersBegin, CountersSize);
+  memcpy(BufNamesBegin, HostNamesBegin, NamesSize);
+
+  // Reorder uniform counters to match data record order. With per-function
+  // counter layout, the linker may reorder __llvm_prf_cnts and
+  // __llvm_prf_ucnts independently. Use UniformCounterPtr from each data
+  // record to extract the right uniform counters in data-record order,
+  // so that llvm-profdata's sequential walk works correctly.
+  char *ReorderedUniformCounters = NULL;
+  size_t ReorderedUniformSize = 0;
+  __llvm_profile_data *RawData = (__llvm_profile_data *)HostDataBegin;
+  if (HostUniformCountersBegin && NumData > 0) {
+    size_t TotalCounters = 0;
+    for (uint64_t i = 0; i < NumData; ++i)
+      TotalCounters += RawData[i].NumCounters;
+    ReorderedUniformSize = TotalCounters * sizeof(uint64_t);
+    ReorderedUniformCounters = (char *)calloc(1, ReorderedUniformSize);
+    if (ReorderedUniformCounters) {
+      size_t DstOffset = 0;
+      for (uint64_t i = 0; i < NumData; ++i) {
+        uint32_t NC = RawData[i].NumCounters;
+        ptrdiff_t UCPtrOff = (ptrdiff_t)RawData[i].UniformCounterPtr;
+        if (NC > 0 && UCPtrOff != 0) {
+          const char *DevDataAddr =
+              (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data));
+          ptrdiff_t SrcOff =
+              (DevDataAddr + UCPtrOff) - (const char *)DevUniformCntsBegin;
+          if (SrcOff >= 0 &&
+              (size_t)(SrcOff + NC * sizeof(uint64_t)) <= UniformCountersSize)
+            memcpy(ReorderedUniformCounters + DstOffset,
+                   HostUniformCountersBegin + SrcOff, NC * sizeof(uint64_t));
+        }
+        DstOffset += NC * sizeof(uint64_t);
+      }
+    }
+  }
+
+  // Relocate CounterPtr in data records for file layout.
+  // CounterPtr is device-relative offset; adjust for file layout where
+  // Data section comes first, then Counters section.
+  __llvm_profile_data *RelocatedData = (__llvm_profile_data *)BufDataBegin;
+  for (uint64_t i = 0; i < NumData; ++i) {
+    if (RelocatedData[i].CounterPtr) {
+      ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr;
+      const char *DeviceDataStructAddr =
+          (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data));
+      const char *DeviceCountersAddr =
+          DeviceDataStructAddr + DeviceCounterPtrOffset;
+      ptrdiff_t OffsetIntoCountersSection =
+          DeviceCountersAddr - (const char *)DevCntsBegin;
+
+      ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters +
+                                    OffsetIntoCountersSection -
+                                    (i * sizeof(__llvm_profile_data));
+      memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
+                 offsetof(__llvm_profile_data, CounterPtr),
+             &NewRelativeOffset, sizeof(NewRelativeOffset));
+    }
+    {
+      ptrdiff_t Zero = 0;
+      memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
+                 offsetof(__llvm_profile_data, UniformCounterPtr),
+             &Zero, sizeof(Zero));
+    }
+    memset((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
+               offsetof(__llvm_profile_data, BitmapPtr),
+           0,
+           sizeof(RelocatedData[i].BitmapPtr) +
+               sizeof(RelocatedData[i].FunctionPointer) +
+               sizeof(RelocatedData[i].Values));
+  }
+
+  // Relocate UniformCounterPtr for file layout. The ucnts section sits at
+  // offset UCFileOffset from the data section start. After reordering, function
+  // i's ucnts start at cumulative offset within the ucnts section.
+  ptrdiff_t UCFileOffset = DataSize + PaddingBytesBeforeCounters +
+                           CountersSize + PaddingBytesAfterCounters + 0 +
+                           PaddingBytesAfterBitmapBytes;
+  if (HostUniformCountersBegin) {
+    size_t CumulativeUCOffset = 0;
+    for (uint64_t i = 0; i < NumData; ++i) {
+      ptrdiff_t NewUCRelativeOffset =
+          UCFileOffset + (ptrdiff_t)CumulativeUCOffset -
+          (ptrdiff_t)(i * sizeof(__llvm_profile_data));
+      memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
+                 offsetof(__llvm_profile_data, UniformCounterPtr),
+             &NewUCRelativeOffset, sizeof(NewUCRelativeOffset));
+      CumulativeUCOffset += RelocatedData[i].NumCounters * sizeof(uint64_t);
+    }
+  }
+
+  char TUIndexStr[16];
+  snprintf(TUIndexStr, sizeof(TUIndexStr), "%d", TUIndex);
+
+  char *UCBegin = ReorderedUniformCounters ? ReorderedUniformCounters
+                                           : HostUniformCountersBegin;
+  size_t UCSize =
+      ReorderedUniformCounters ? ReorderedUniformSize : UniformCountersSize;
+
+  ret = __llvm_write_custom_profile(
+      Target, TUIndexStr, (__llvm_profile_data *)BufDataBegin,
+      (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin,
+      BufCountersBegin + CountersSize, UCBegin,
+      UCBegin ? UCBegin + UCSize : NULL, BufNamesBegin,
+      BufNamesBegin + NamesSize, NULL);
+
+  free(ContiguousBuffer);
+  free(ReorderedUniformCounters);
+
+  if (ret != 0) {
+    PROF_ERR("%s\n", "Failed to write device profile using shared API");
+  } else if (IsVerboseMode()) {
+    PROF_NOTE("%s\n", "Successfully wrote device profile using shared API");
+  }
+
+cleanup:
+  if (!CntsReused)
+    free(HostCountersBegin);
+  if (!DataReused)
+    free(HostDataBegin);
+  if (!NamesReused)
+    free(HostNamesBegin);
+  if (!UCntsReused)
+    free(HostUniformCountersBegin);
+  return ret;
+}
+
+static int ProcessShadowVariable(void *ShadowVar, int TUIndex,
+                                 const char *Target) {
+  /* Resolve the device-side address backing this host shadow variable. */
+  void *SymAddr = NULL;
+  if (hipGetSymbolAddress(&SymAddr, ShadowVar) != 0) {
+    PROF_WARN("Failed to get symbol address for shadow variable %p\n",
+              ShadowVar);
+    return -1;
+  }
+  /* The shadow variable holds a pointer to __llvm_profile_sections (defined
+   * in the GPU profile runtime); copy that pointer back to the host. */
+  void *SectionsPtr = NULL;
+  if (hipMemcpy(&SectionsPtr, SymAddr, sizeof(void *), 2 /*DToH*/) != 0) {
+    PROF_WARN("Failed to read sections pointer from shadow variable %p\n",
+              ShadowVar);
+    return -1;
+  }
+  return ProcessDeviceOffloadPrf(SectionsPtr, TUIndex, Target);
+}
+
+/* Report whether the dynamically resolved HIP entry points are usable. */
+static int IsHipAvailable(void) {
+  EnsureHipLoaded();
+  return pHipMemcpy && pHipGetSymbolAddress;
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Collect device-side profile data                                          */
+/* -------------------------------------------------------------------------- */
+
+int __llvm_profile_hip_collect_device_data(void) {
+  if (NumShadowVariables == 0 && NumDynamicModules == 0)
+    return 0;
+  if (!IsHipAvailable())
+    return 0;
+
+  int Status = 0;
+
+  /* Statically linked kernels are reached through shadow variables; walk
+   * every visible GPU so each device contributes its profile data. */
+  if (NumShadowVariables > 0) {
+    int SavedDevice = -1;
+    hipGetDevice(&SavedDevice);
+
+    for (int DevId = 0; DevId < NumDevices; ++DevId) {
+      if (hipSetDevice(DevId) != 0) {
+        if (IsVerboseMode())
+          PROF_NOTE("Failed to set device %d, skipping\n", DevId);
+        continue;
+      }
+      const char *Arch = getDeviceArchName(DevId);
+      if (IsVerboseMode())
+        PROF_NOTE("Collecting static profile data from device %d (%s)\n",
+                  DevId, Arch);
+      for (int SV = 0; SV < NumShadowVariables; ++SV)
+        if (ProcessShadowVariable(OffloadShadowVariables[SV], SV, Arch) != 0)
+          Status = -1;
+    }
+
+    /* Leave the caller's current device selection unchanged. */
+    if (SavedDevice >= 0)
+      hipSetDevice(SavedDevice);
+  }
+
+  /* Dynamically loaded modules: flag any translation unit whose profile
+   * was never collected before process exit. */
+  for (int M = 0; M < NumDynamicModules; ++M) {
+    OffloadDynamicModuleInfo *Mod = &DynamicModules[M];
+    for (int TU = 0; TU < Mod->NumTUs; ++TU) {
+      if (!Mod->TUs[TU].Processed) {
+        PROF_WARN("Dynamic module %p TU %d was not processed before exit\n",
+                  Mod->ModulePtr, TU);
+        Status = -1;
+      }
+    }
+  }
+
+  return Status;
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c
index 633fdb9661162..2c459acd39af0 100644
--- a/compiler-rt/lib/profile/InstrProfilingWriter.c
+++ b/compiler-rt/lib/profile/InstrProfilingWriter.c
@@ -256,7 +256,9 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer,
   const char *VNamesEnd = __llvm_profile_end_vtabnames();
   uint64_t Version = __llvm_profile_get_version();
   return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin,
-                            CountersEnd, BitmapBegin, BitmapEnd, VPDataReader,
+                            CountersEnd, BitmapBegin, BitmapEnd,
+                            /*UniformCountersBegin=*/NULL,
+                            /*UniformCountersEnd=*/NULL, VPDataReader,
                             NamesBegin, NamesEnd, VTableBegin, VTableEnd,
                             VNamesBegin, VNamesEnd, SkipNameDataWrite, Version);
 }
@@ -265,6 +267,7 @@ COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
     ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
     const __llvm_profile_data *DataEnd, const char *CountersBegin,
     const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd,
+    const char *UniformCountersBegin, const char *UniformCountersEnd,
     VPDataReaderType *VPDataReader, const char *NamesBegin,
     const char *NamesEnd, const VTableProfData *VTableBegin,
     const VTableProfData *VTableEnd, const char *VNamesBegin,
@@ -286,6 +289,13 @@ COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
       __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd);
   const uint64_t VNamesSize =
       __llvm_profile_get_name_size(VNamesBegin, VNamesEnd);
+  const uint64_t NumUniformCounters =
+      (UniformCountersBegin && UniformCountersEnd &&
+       UniformCountersEnd > UniformCountersBegin)
+          ? (UniformCountersEnd - UniformCountersBegin) / sizeof(uint64_t)
+          : 0;
+  const uint64_t UniformCountersSectionSize =
+      NumUniformCounters * sizeof(uint64_t);
 
   /* Create the header. */
   __llvm_profile_header Header;
@@ -293,12 +303,13 @@ COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
   /* Determine how much padding is needed before/after the counters and after
    * the names. */
   uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
-      PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames,
-      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+      PaddingBytesAfterBitmapBytes, PaddingBytesAfterUniformCounters,
+      PaddingBytesAfterNames, PaddingBytesAfterVTable, PaddingBytesAfterVNames;
   if (__llvm_profile_get_padding_sizes_for_counters(
-          DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize,
-          VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters,
-          &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+          DataSectionSize, CountersSectionSize, NumBitmapBytes,
+          NumUniformCounters, NamesSize, VTableSectionSize, VNamesSize,
+          &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
+          &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterUniformCounters,
           &PaddingBytesAfterNames, &PaddingBytesAfterVTable,
           &PaddingBytesAfterVNames) == -1)
     return -1;
@@ -315,12 +326,25 @@ COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
 #ifdef _WIN64
   Header.CountersDelta = (uint32_t)Header.CountersDelta;
   Header.BitmapDelta = (uint32_t)Header.BitmapDelta;
+  Header.UniformCountersDelta = (uint32_t)Header.UniformCountersDelta;
 #endif
 
+  /* Recompute UniformCountersDelta from file layout. The macro initializer
+     uses in-memory pointer arithmetic which is wrong when sections are in
+     separate allocations (e.g., custom profiles from device PGO). */
+  if (NumUniformCounters > 0)
+    Header.UniformCountersDelta = DataSectionSize + PaddingBytesBeforeCounters +
+                                  CountersSectionSize +
+                                  PaddingBytesAfterCounters + NumBitmapBytes +
+                                  PaddingBytesAfterBitmapBytes;
+  else
+    Header.UniformCountersDelta = 0;
+
   /* The data and names sections are omitted in lightweight mode. */
   if (NumData == 0 && NamesSize == 0) {
     Header.CountersDelta = 0;
     Header.NamesDelta = 0;
+    Header.UniformCountersDelta = 0;
   }
 
   /* Write the profile header. */
@@ -340,6 +364,8 @@ COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
       {NULL, sizeof(uint8_t), PaddingBytesAfterCounters, 1},
       {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0},
       {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1},
+      {UniformCountersBegin, sizeof(uint8_t), UniformCountersSectionSize, 0},
+      {NULL, sizeof(uint8_t), PaddingBytesAfterUniformCounters, 1},
       {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0},
       {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1},
       {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0},
diff --git a/compiler-rt/test/asan/TestCases/log-path_test.cpp b/compiler-rt/test/asan/TestCases/log-path_test.cpp
index 38a1ee1e12fbe..b454129d809bb 100644
--- a/compiler-rt/test/asan/TestCases/log-path_test.cpp
+++ b/compiler-rt/test/asan/TestCases/log-path_test.cpp
@@ -12,8 +12,8 @@
 // RUN: %env_asan_opts=log_path=%t.log not %run %t 2> %t.out
 // RUN: FileCheck %s --check-prefix=CHECK-ERROR < %t.log.*
 
-// Invalid log_path in existing directory.
-// RUN: %env_asan_opts=log_path=/INVALID not %run %t 2> %t.out
+// Invalid log_path: /INVALID.<pid> is creatable when running as root, so use an unopenable path instead (see memprof log_path_test.cpp).
+// RUN: %env_asan_opts=log_path=/proc/self/mem not %run %t 2> %t.out
 // RUN: FileCheck %s --check-prefix=CHECK-INVALID < %t.out
 
 // Directory of log_path can't be created.
@@ -45,6 +45,6 @@ int main(int argc, char **argv) {
   return res;
 }
 // CHECK-ERROR: ERROR: AddressSanitizer
-// CHECK-INVALID: ERROR: Can't open file: /INVALID
+// CHECK-INVALID: ERROR: Can't open file: /proc/self/mem.
 // CHECK-BAD-DIR: ERROR: Can't create directory: /dev/null
 // CHECK-LONG: ERROR: Path is too long: 01234
diff --git a/compiler-rt/test/memprof/TestCases/log_path_test.cpp b/compiler-rt/test/memprof/TestCases/log_path_test.cpp
index 4c38bd56ebb64..18d5e246c3e9a 100644
--- a/compiler-rt/test/memprof/TestCases/log_path_test.cpp
+++ b/compiler-rt/test/memprof/TestCases/log_path_test.cpp
@@ -8,8 +8,14 @@
 // RUN: %env_memprof_opts=print_text=true:log_path=%t.log %run %t
 // RUN: FileCheck %s --check-prefix=CHECK-GOOD --dump-input=always < %t.log.*
 
-// Invalid log_path.
-// RUN: %env_memprof_opts=print_text=true:log_path=/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID --dump-input=always
+// Invalid log_path: the runtime opens "<prefix>.<pid>". Historically tests used a
+// prefix like /INVALID, which becomes /INVALID.<pid> under /. That fails for a
+// normal user (no write permission on /) but succeeds for root, which is common in
+// CI and default Docker images—so the sanitizer would open a real file, emit no
+// ERROR, and `not %run` would fail. Use /proc/self/mem instead: the resulting path
+// is not openable as a writable log file even for root, so we still get the
+// expected diagnostic and exit status.
+// RUN: %env_memprof_opts=print_text=true:log_path=/proc/self/mem not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID --dump-input=always
 
 // Directory of log_path can't be created.
 // RUN: %env_memprof_opts=print_text=true:log_path=/dev/null/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-DIR --dump-input=always
@@ -19,13 +25,11 @@
 // RUN: %env_memprof_opts=print_text=true:log_path=%{readfile:%t.long_log_path} \
 // RUN:   not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-LONG --dump-input=always
 
-// Specifying the log name via the __memprof_profile_filename variable.
-// Note we use an invalid path since that is sufficient for checking that the
-// specified __memprof_profile_filename value is passed through to the runtime.
-// Using an automatically generated name via %t can cause weird issues with
-// unexpected macro expansion if the path includes tokens that match a build
-// system macro (e.g. "linux").
-// RUN: %clangxx_memprof  %s -o %t -DPROFILE_NAME_VAR="/INVALID"
+// Specifying the log name via the __memprof_profile_filename variable (same
+// unopenable prefix as the log_path= case above). Use -DPROFILE_NAME_VAR=/path
+// without extra shell quotes so the preprocessor yields a normal C string; forms
+// like -DPROFILE_NAME_VAR=\"/path\" stringify incorrectly with xstr()/str().
+// RUN: %clangxx_memprof  %s -o %t -DPROFILE_NAME_VAR=/proc/self/mem
 // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID --dump-input=always
 
 #include <sanitizer/memprof_interface.h>
@@ -46,6 +50,7 @@ int main(int argc, char **argv) {
   return 0;
 }
 // CHECK-GOOD: Memory allocation stack id
-// CHECK-INVALID: ERROR: Can't open file: /INVALID
+// The next line matches /proc/self/mem.<pid> stderr from the invalid log_path RUNs above.
+// CHECK-INVALID: ERROR: Can't open file: /proc/self/mem.
 // CHECK-BAD-DIR: ERROR: Can't create directory: /dev/null
 // CHECK-LONG: ERROR: Path is too long: 01234
diff --git a/libc/test/lit.site.cfg.py.in b/libc/test/lit.site.cfg.py.in
index 7773bdfdf0e9c..35501cf6b1564 100644
--- a/libc/test/lit.site.cfg.py.in
+++ b/libc/test/lit.site.cfg.py.in
@@ -12,7 +12,9 @@ config.libc_gpu_loader = path(r"@LIBC_GPU_LOADER_EXECUTABLE@")
 
 # If running GPU tests and no explicit test command is set, use the GPU loader.
 if not config.libc_test_cmd and config.libc_gpu_loader:
-    config.libc_test_cmd = config.libc_gpu_loader + " @BINARY@"
+    # Build the placeholder without literal at-signs, since CMake's configure_file would substitute such tokens away.
+    _libc_lit_exe = chr(64) + "BINARY" + chr(64)
+    config.libc_test_cmd = config.libc_gpu_loader + " " + _libc_lit_exe
 
 # Add libc's utils directory to the path so we can import the test format.
 site.addsitedir(os.path.join(config.libc_src_root, "utils"))
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index 0d79677d7079e..d238c81ecc152 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -60,3 +60,4 @@ LLVM_FIXED_MD_KIND(MD_alloc_token, "alloc_token", 45)
 LLVM_FIXED_MD_KIND(MD_implicit_ref, "implicit.ref", 46)
 LLVM_FIXED_MD_KIND(MD_nofpclass, "nofpclass", 47)
 LLVM_FIXED_MD_KIND(MD_call_target, "call_target", 48)
+LLVM_FIXED_MD_KIND(MD_block_uniformity_profile, "block-uniformity-profile", 49)
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f742476ac854a..e2b3bbbfdafe9 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -894,6 +894,16 @@ struct InstrProfValueSiteRecord {
 struct InstrProfRecord {
   std::vector<uint64_t> Counts;
   std::vector<uint8_t> BitmapBytes;
+  /// For AMDGPU offload profiling: raw uniform counters embedded in the
+  /// profraw file. One uint64_t per instrumented block, tracking wave-uniform
+  /// execution counts on the GPU.
+  std::vector<uint64_t> UniformCounts;
+  /// For AMDGPU offload profiling: 1 bit per basic block indicating whether
+  /// the block is entered via a wave-uniform branch. Set during merge when
+  /// per-slot counters are reduced. If a counter value is a multiple of the
+  /// wave size, the branch is considered wave-uniform.
+  std::vector<uint8_t> UniformityBits;
+  uint16_t OffloadDeviceWaveSize = 0;
 
   InstrProfRecord() = default;
   InstrProfRecord(std::vector<uint64_t> Counts) : Counts(std::move(Counts)) {}
@@ -903,6 +913,8 @@ struct InstrProfRecord {
   InstrProfRecord(InstrProfRecord &&) = default;
   InstrProfRecord(const InstrProfRecord &RHS)
       : Counts(RHS.Counts), BitmapBytes(RHS.BitmapBytes),
+        UniformCounts(RHS.UniformCounts), UniformityBits(RHS.UniformityBits),
+        OffloadDeviceWaveSize(RHS.OffloadDeviceWaveSize),
         ValueData(RHS.ValueData
                       ? std::make_unique<ValueProfData>(*RHS.ValueData)
                       : nullptr) {}
@@ -910,6 +922,8 @@ struct InstrProfRecord {
   InstrProfRecord &operator=(const InstrProfRecord &RHS) {
     Counts = RHS.Counts;
     BitmapBytes = RHS.BitmapBytes;
+    UniformityBits = RHS.UniformityBits;
+    OffloadDeviceWaveSize = RHS.OffloadDeviceWaveSize;
     if (!RHS.ValueData) {
       ValueData = nullptr;
       return *this;
@@ -921,6 +935,17 @@ struct InstrProfRecord {
     return *this;
   }
 
+  /// Check if a basic block is entered via a wave-uniform branch.
+  /// Returns true if uniform (safe for PGO spill optimization) or if no
+  /// uniformity data is available (conservative default).
+  bool isBlockUniform(unsigned BlockIdx) const {
+    const size_t Byte = BlockIdx / 8;
+    // Missing data and out-of-range indices both report uniform.
+    if (Byte >= UniformityBits.size())
+      return true;
+    return (UniformityBits[Byte] >> (BlockIdx % 8)) & 1;
+  }
+
   /// Return the number of value profile kinds with non-zero number
   /// of profile sites.
   inline uint32_t getNumValueKinds() const;
@@ -945,8 +970,12 @@ struct InstrProfRecord {
 
   /// Merge the counts in \p Other into this one.
   /// Optionally scale merged counts by \p Weight.
+  /// If \p WaveSize is non-zero and Other has offload profiling slots,
+  /// compute uniformity bits based on whether counter values are multiples
+  /// of WaveSize.
   LLVM_ABI void merge(InstrProfRecord &Other, uint64_t Weight,
-                      function_ref<void(instrprof_error)> Warn);
+                      function_ref<void(instrprof_error)> Warn,
+                      unsigned WaveSize = 0);
 
   /// Scale up profile counts (including value profile data) by
   /// a factor of (N / D).
@@ -1071,6 +1100,14 @@ struct NamedInstrProfRecord : InstrProfRecord {
                        std::vector<uint8_t> BitmapBytes)
       : InstrProfRecord(std::move(Counts), std::move(BitmapBytes)), Name(Name),
         Hash(Hash) {}
+  NamedInstrProfRecord(StringRef Name, uint64_t Hash,
+                       std::vector<uint64_t> Counts,
+                       std::vector<uint8_t> BitmapBytes,
+                       std::vector<uint8_t> UniformityBits)
+      : InstrProfRecord(std::move(Counts), std::move(BitmapBytes)), Name(Name),
+        Hash(Hash) {
+    this->UniformityBits = std::move(UniformityBits);
+  }
 
   static bool hasCSFlagInHash(uint64_t FuncHash) {
     return ((FuncHash >> CS_FLAG_IN_FUNC_HASH) & 1);
@@ -1177,7 +1214,9 @@ enum ProfVersion {
   Version12 = 12,
   // In this version, the frontend PGO stable hash algorithm defaults to V4.
   Version13 = 13,
-  // The current version is 13.
+  // UniformityBits added for AMDGPU offload profiling divergence detection.
+  Version14 = 14,
+  // The current version is 14.
   CurrentVersion = INSTR_PROF_INDEX_VERSION
 };
 const uint64_t Version = ProfVersion::CurrentVersion;
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 7525feab8f133..c1aa4595b8736 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -78,6 +78,8 @@ INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
                 ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
                 Inc->getHash()->getZExtValue()))
 INSTR_PROF_DATA(const IntPtrT, IntPtrTy, CounterPtr, RelativeCounterPtr)
+INSTR_PROF_DATA(const IntPtrT, IntPtrTy, UniformCounterPtr, \
+                RelativeUniformCounterPtr)
 INSTR_PROF_DATA(const IntPtrT, IntPtrTy, BitmapPtr, RelativeBitmapPtr)
 /* This is used to map function pointers for the indirect call targets to
  * function name hashes during the conversion from raw to merged profile
@@ -89,9 +91,13 @@ INSTR_PROF_DATA(IntPtrT, llvm::PointerType::getUnqual(Ctx), Values, \
                 ValuesPtrExpr)
 INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
-INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \
-                ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
+INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last + 1],
+                ConstantArray::get(Int16ArrayTy, Int16ArrayVals))
+INSTR_PROF_DATA(const uint16_t, llvm::Type::getInt16Ty(Ctx),
+                OffloadDeviceWaveSize,
+                ConstantInt::get(llvm::Type::getInt16Ty(Ctx),
+                                 OffloadDeviceWaveSizeVal))
+INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes,
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes))
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
@@ -168,6 +174,12 @@ INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
 INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
                     Ctx), DataStop,                                            \
                     ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
+INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx),                 \
+                    UniformCountersStart,                                       \
+                    ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
+INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx),                 \
+                    UniformCountersStop,                                        \
+                    ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
 INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx),             \
                     VersionVar,                                                \
                     ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
@@ -192,6 +204,10 @@ INSTR_PROF_RAW_HEADER(uint64_t, NumCounters, NumCounters)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterCounters, PaddingBytesAfterCounters)
 INSTR_PROF_RAW_HEADER(uint64_t, NumBitmapBytes, NumBitmapBytes)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterBitmapBytes, PaddingBytesAfterBitmapBytes)
+INSTR_PROF_RAW_HEADER(uint64_t, NumUniformCounters, NumUniformCounters)
+INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterUniformCounters, PaddingBytesAfterUniformCounters)
+INSTR_PROF_RAW_HEADER(uint64_t, UniformCountersDelta,
+    UniformCountersBegin ? (uintptr_t)UniformCountersBegin - (uintptr_t)DataBegin : 0)
 INSTR_PROF_RAW_HEADER(uint64_t, NamesSize,  NamesSize)
 INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
                       (uintptr_t)CountersBegin - (uintptr_t)DataBegin)
@@ -356,6 +372,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_ucnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_UCNTS_COMMON), \
+                      INSTR_PROF_UCNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -752,9 +771,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
 /* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 10
+#define INSTR_PROF_RAW_VERSION 11
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 13
+#define INSTR_PROF_INDEX_VERSION 14
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
 
@@ -803,6 +822,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_UCNTS_COMMON __llvm_prf_ucnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
@@ -820,6 +840,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COFF ".lprfn$M"
 #define INSTR_PROF_VNAME_COFF ".lprfvn$M"
 #define INSTR_PROF_CNTS_COFF ".lprfc$M"
+#define INSTR_PROF_UCNTS_COFF ".lprfuc$M"
 #define INSTR_PROF_BITS_COFF ".lprfb$M"
 #define INSTR_PROF_VALS_COFF ".lprfv$M"
 #define INSTR_PROF_VNODES_COFF ".lprfnd$M"
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 134195059f9e6..4d6351f6425ef 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -349,6 +349,7 @@ class RawInstrProfReader : public InstrProfReader {
   uint64_t Version;
   uint64_t CountersDelta;
   uint64_t BitmapDelta;
+  uint64_t UniformCountersDelta;
   uint64_t NamesDelta;
   const RawInstrProf::ProfileData<IntPtrT> *Data;
   const RawInstrProf::ProfileData<IntPtrT> *DataEnd;
@@ -358,6 +359,8 @@ class RawInstrProfReader : public InstrProfReader {
   const char *CountersEnd;
   const char *BitmapStart;
   const char *BitmapEnd;
+  const char *UniformCountersStart;
+  const char *UniformCountersEnd;
   const char *NamesStart;
   const char *NamesEnd;
   const char *VNamesStart = nullptr;
@@ -469,6 +472,7 @@ class RawInstrProfReader : public InstrProfReader {
   Error readFuncHash(NamedInstrProfRecord &Record);
   Error readRawCounts(InstrProfRecord &Record);
   Error readRawBitmapBytes(InstrProfRecord &Record);
+  Error readRawUniformCounters(InstrProfRecord &Record);
   Error readValueProfilingData(InstrProfRecord &Record);
   bool atEnd() const { return Data == DataEnd; }
 
@@ -482,6 +486,7 @@ class RawInstrProfReader : public InstrProfReader {
       // with respect to the next record.
       CountersDelta -= sizeof(*Data);
       BitmapDelta -= sizeof(*Data);
+      UniformCountersDelta -= sizeof(*Data);
     }
     Data++;
     ValueDataStart += CurValueDataSize;
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 1b24425e68a9e..93212c19769e0 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -215,6 +215,7 @@ class InstrProfWriter {
     MemProfVersionRequested = Version;
   }
   void setMemProfFullSchema(bool Full) { MemProfFullSchema = Full; }
+
   // Compute the overlap b/w this object and Other. Program level result is
   // stored in Overlap and function level result is stored in FuncLevelOverlap.
   LLVM_ABI void overlapRecord(NamedInstrProfRecord &&Other,
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 6b93b6cb83b4e..efef78eadd31e 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -286,12 +286,30 @@ template <class Edge, class BBInfo> class CFGMST {
     if (!Message.str().empty())
       OS << Message << "\n";
     OS << "  Number of Basic Blocks: " << BBInfos.size() << "\n";
-    for (auto &BI : BBInfos) {
-      const BasicBlock *BB = BI.first;
+    // Collect and sort BBInfos deterministically by their assigned Index.
+    std::vector<std::pair<const BasicBlock *, const BBInfo *>> SortedBBInfos;
+    SortedBBInfos.reserve(BBInfos.size());
+    for (const auto &BI : BBInfos)
+      SortedBBInfos.emplace_back(BI.first, BI.second.get());
+
+    llvm::sort(SortedBBInfos, [](const auto &A, const auto &B) {
+      // Primary key: BBInfo Index
+      if (A.second->Index != B.second->Index)
+        return A.second->Index < B.second->Index;
+      // Secondary key: name string to keep a stable order even if
+      // indices tie (ties shouldn't happen, but this makes ordering
+      // explicit).
+      StringRef NameA = A.first ? A.first->getName() : StringRef("FakeNode");
+      StringRef NameB = B.first ? B.first->getName() : StringRef("FakeNode");
+      return NameA < NameB;
+    });
+
+    for (const auto &P : SortedBBInfos) {
+      const BasicBlock *BB = P.first;
+      const BBInfo *Info = P.second;
       OS << "  BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << "  "
-         << BI.second->infoString() << "\n";
+         << Info->infoString() << "\n";
     }
-
     OS << "  Number of Edges: " << AllEdges.size()
        << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
     uint32_t Count = 0;
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index b96db851fa6bd..f317f82fbf0ce 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -950,7 +950,8 @@ void InstrProfRecord::mergeValueProfData(
 }
 
 void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight,
-                            function_ref<void(instrprof_error)> Warn) {
+                            function_ref<void(instrprof_error)> Warn,
+                            unsigned WaveSize) {
   // If the number of counters doesn't match we either have bad data
   // or a hash collision.
   if (Counts.size() != Other.Counts.size()) {
@@ -975,7 +976,7 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight,
       setPseudoCount(PseudoWarm);
     return;
   }
-
+  OffloadDeviceWaveSize = Other.OffloadDeviceWaveSize; // FIXME: WaveSize param is unused; confirm
   for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) {
     bool Overflowed;
     uint64_t Value =
@@ -1685,7 +1686,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
       IndexedInstrProf::ProfVersion::CurrentVersion)
     return make_error<InstrProfError>(instrprof_error::unsupported_version);
 
-  static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
+  static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version14,
                 "Please update the reader as needed when a new field is added "
                 "or when indexed profile version gets bumped.");
 
@@ -1718,10 +1719,11 @@ size_t Header::size() const {
     // of the header, and byte offset of existing fields shouldn't change when
     // indexed profile version gets incremented.
     static_assert(
-        IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
+        IndexedInstrProf::ProfVersion::CurrentVersion == Version14,
         "Please update the size computation below if a new field has "
         "been added to the header; for a version bump without new "
         "fields, add a case statement to fall through to the latest version.");
+  case 14ull: // UniformityBits added in record data, no header change
   case 13ull:
   case 12ull:
     return 72;
diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
index b38189de31606..41bc0cc761281 100644
--- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp
+++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
@@ -311,6 +311,7 @@ void InstrProfCorrelatorImpl<IntPtrT>::addDataProbe(uint64_t NameRef,
       // In this mode, CounterPtr actually stores the section relative address
       // of the counter.
       maybeSwap<IntPtrT>(CounterOffset),
+      /*UniformCounterPtr=*/maybeSwap<IntPtrT>(0),
       // TODO: MC/DC is not yet supported.
       /*BitmapOffset=*/maybeSwap<IntPtrT>(0),
       maybeSwap<IntPtrT>(FunctionPtr),
@@ -318,6 +319,7 @@ void InstrProfCorrelatorImpl<IntPtrT>::addDataProbe(uint64_t NameRef,
       /*ValuesPtr=*/maybeSwap<IntPtrT>(0),
       maybeSwap<uint32_t>(NumCounters),
       /*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)},
+      /*OffloadDeviceWaveSize=*/maybeSwap<uint16_t>(0),
       // TODO: MC/DC is not yet supported.
       /*NumBitmapBytes=*/maybeSwap<uint32_t>(0),
   });
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 8147ee8d0e816..b0bf80c0a7a6e 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -607,6 +607,7 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
 
   CountersDelta = swap(Header.CountersDelta);
   BitmapDelta = swap(Header.BitmapDelta);
+  UniformCountersDelta = swap(Header.UniformCountersDelta);
   NamesDelta = swap(Header.NamesDelta);
   auto NumData = swap(Header.NumData);
   auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters);
@@ -614,6 +615,9 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
   auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters);
   auto NumBitmapBytes = swap(Header.NumBitmapBytes);
   auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes);
+  auto NumUniformCounters = swap(Header.NumUniformCounters);
+  auto PaddingBytesAfterUniformCounters =
+      swap(Header.PaddingBytesAfterUniformCounters);
   auto NamesSize = swap(Header.NamesSize);
   auto VTableNameSize = swap(Header.VNamesSize);
   auto NumVTables = swap(Header.NumVTables);
@@ -626,14 +630,17 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
   auto VTableSectionSize =
       NumVTables * sizeof(RawInstrProf::VTableProfileData<IntPtrT>);
   auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize);
+  auto UniformCountersSectionSize = NumUniformCounters * sizeof(uint64_t);
 
   // Profile data starts after profile header and binary ids if exist.
   ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize;
   ptrdiff_t CountersOffset = DataOffset + DataSize + PaddingBytesBeforeCounters;
   ptrdiff_t BitmapOffset =
       CountersOffset + CountersSize + PaddingBytesAfterCounters;
-  ptrdiff_t NamesOffset =
+  ptrdiff_t UniformCountersOffset =
       BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes;
+  ptrdiff_t NamesOffset = UniformCountersOffset + UniformCountersSectionSize +
+                          PaddingBytesAfterUniformCounters;
   ptrdiff_t VTableProfDataOffset =
       NamesOffset + NamesSize + PaddingBytesAfterNames;
   ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize +
@@ -694,6 +701,8 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
   CountersEnd = CountersStart + CountersSize;
   BitmapStart = Start + BitmapOffset;
   BitmapEnd = BitmapStart + NumBitmapBytes;
+  UniformCountersStart = Start + UniformCountersOffset;
+  UniformCountersEnd = UniformCountersStart + UniformCountersSectionSize;
   ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset);
 
   std::unique_ptr<InstrProfSymtab> NewSymtab = std::make_unique<InstrProfSymtab>();
@@ -827,6 +836,52 @@ Error RawInstrProfReader<IntPtrT>::readRawBitmapBytes(InstrProfRecord &Record) {
   return success();
 }
 
+template <class IntPtrT>
+Error RawInstrProfReader<IntPtrT>::readRawUniformCounters(
+    InstrProfRecord &Record) {
+  Record.UniformCounts.clear();
+
+  if (UniformCountersStart == UniformCountersEnd)
+    return success();
+
+  uint32_t NumCounters = swap(Data->NumCounters);
+
+  ptrdiff_t UniformCounterOffset =
+      swap(Data->UniformCounterPtr) - UniformCountersDelta;
+  if (UniformCounterOffset < 0)
+    return error(instrprof_error::malformed,
+                 ("uniform counter offset " + Twine(UniformCounterOffset) +
+                  " is negative")
+                     .str());
+
+  if (UniformCounterOffset >= UniformCountersEnd - UniformCountersStart)
+    return error(instrprof_error::malformed,
+                 ("uniform counter offset " + Twine(UniformCounterOffset) +
+                  " is greater than the maximum uniform counter offset " +
+                  Twine(UniformCountersEnd - UniformCountersStart - 1))
+                     .str());
+
+  uint64_t MaxNumCounters =
+      (UniformCountersEnd - (UniformCountersStart + UniformCounterOffset)) /
+      sizeof(uint64_t);
+  if (NumCounters > MaxNumCounters)
+    return error(instrprof_error::malformed,
+                 ("number of uniform counters " + Twine(NumCounters) +
+                  " is greater than the maximum number of uniform counters " +
+                  Twine(MaxNumCounters))
+                     .str());
+
+  Record.UniformCounts.reserve(NumCounters);
+  for (uint32_t I = 0; I < NumCounters; I++) {
+    const char *Ptr =
+        UniformCountersStart + UniformCounterOffset + I * sizeof(uint64_t);
+    uint64_t CounterValue = swap(*reinterpret_cast<const uint64_t *>(Ptr));
+    Record.UniformCounts.push_back(CounterValue);
+  }
+
+  return success();
+}
+
 template <class IntPtrT>
 Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
     InstrProfRecord &Record) {
@@ -873,6 +928,8 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record)
   if (Error E = readFuncHash(Record))
     return error(std::move(E));
 
+  Record.OffloadDeviceWaveSize = swap(Data->OffloadDeviceWaveSize);
+
   // Read raw counts and set Record.
   if (Error E = readRawCounts(Record))
     return error(std::move(E));
@@ -881,6 +938,10 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record)
   if (Error E = readRawBitmapBytes(Record))
     return error(std::move(E));
 
+  // Read raw uniform counters and set Record.
+  if (Error E = readRawUniformCounters(Record))
+    return error(std::move(E));
+
   // Read value data and set Record.
   if (Error E = readValueProfilingData(Record))
     return error(std::move(E));
@@ -945,11 +1006,12 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
   DataBuffer.clear();
   std::vector<uint64_t> CounterBuffer;
   std::vector<uint8_t> BitmapByteBuffer;
+  std::vector<uint8_t> UniformityBitsBuffer;
 
   const unsigned char *End = D + N;
   while (D < End) {
     // Read hash.
-    if (D + sizeof(uint64_t) >= End)
+    if (D + sizeof(uint64_t) > End)
       return data_type();
     uint64_t Hash = endian::readNext<uint64_t, llvm::endianness::little>(D);
 
@@ -977,18 +1039,51 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
       if (D + sizeof(uint64_t) > End)
         return data_type();
       BitmapBytes = endian::readNext<uint64_t, llvm::endianness::little>(D);
-      // Read bitmap byte values.
-      if (D + BitmapBytes * sizeof(uint8_t) > End)
-        return data_type();
       BitmapByteBuffer.clear();
       BitmapByteBuffer.reserve(BitmapBytes);
-      for (uint64_t J = 0; J < BitmapBytes; ++J)
-        BitmapByteBuffer.push_back(static_cast<uint8_t>(
-            endian::readNext<uint64_t, llvm::endianness::little>(D)));
+
+      if (GET_VERSION(FormatVersion) >=
+          IndexedInstrProf::ProfVersion::Version14) {
+        // Version 14+: bitmap bytes stored as uint8_t with padding.
+        uint64_t PaddedSize = alignTo(BitmapBytes, sizeof(uint64_t));
+        if (D + PaddedSize > End)
+          return data_type();
+        for (uint64_t J = 0; J < BitmapBytes; ++J)
+          BitmapByteBuffer.push_back(
+              endian::readNext<uint8_t, llvm::endianness::little>(D));
+        for (uint64_t J = BitmapBytes; J < PaddedSize; ++J)
+          (void)endian::readNext<uint8_t, llvm::endianness::little>(D);
+
+        // Read uniformity bits (AMDGPU offload profiling).
+        uint64_t UniformityBitsSize = 0;
+        if (D + sizeof(uint64_t) > End)
+          return data_type();
+        UniformityBitsSize =
+            endian::readNext<uint64_t, llvm::endianness::little>(D);
+        uint64_t PaddedUniformitySize =
+            alignTo(UniformityBitsSize, sizeof(uint64_t));
+        if (D + PaddedUniformitySize > End)
+          return data_type();
+        UniformityBitsBuffer.clear();
+        UniformityBitsBuffer.reserve(UniformityBitsSize);
+        for (uint64_t J = 0; J < UniformityBitsSize; ++J)
+          UniformityBitsBuffer.push_back(
+              endian::readNext<uint8_t, llvm::endianness::little>(D));
+        for (uint64_t J = UniformityBitsSize; J < PaddedUniformitySize; ++J)
+          (void)endian::readNext<uint8_t, llvm::endianness::little>(D);
+      } else {
+        // Version 11-13: each bitmap byte stored as a uint64_t.
+        if (D + BitmapBytes * sizeof(uint64_t) > End)
+          return data_type();
+        for (uint64_t J = 0; J < BitmapBytes; ++J)
+          BitmapByteBuffer.push_back(static_cast<uint8_t>(
+              endian::readNext<uint64_t, llvm::endianness::little>(D)));
+      }
     }
 
     DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer),
-                            std::move(BitmapByteBuffer));
+                            std::move(BitmapByteBuffer),
+                            std::move(UniformityBitsBuffer));
 
     // Read value profiling data.
     if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version2 &&
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 0f15ca8ff6df7..96611612184e7 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -51,6 +51,7 @@ class InstrProfRecordWriterTrait {
   llvm::endianness ValueProfDataEndianness = llvm::endianness::little;
   InstrProfSummaryBuilder *SummaryBuilder;
   InstrProfSummaryBuilder *CSSummaryBuilder;
+  bool WritePrevVersion = false;
 
   InstrProfRecordWriterTrait() = default;
 
@@ -58,7 +59,7 @@ class InstrProfRecordWriterTrait {
     return IndexedInstrProf::ComputeHash(K);
   }
 
-  static std::pair<offset_type, offset_type>
+  std::pair<offset_type, offset_type>
   EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
     using namespace support;
 
@@ -74,7 +75,16 @@ class InstrProfRecordWriterTrait {
       M += sizeof(uint64_t); // The size of the Counts vector
       M += ProfRecord.Counts.size() * sizeof(uint64_t);
       M += sizeof(uint64_t); // The size of the Bitmap vector
-      M += ProfRecord.BitmapBytes.size() * sizeof(uint64_t);
+      if (WritePrevVersion) {
+        // Version 13: each bitmap byte stored as a uint64_t.
+        M += ProfRecord.BitmapBytes.size() * sizeof(uint64_t);
+      } else {
+        // Version 14+: bitmap bytes as uint8_t with padding, plus
+        // uniformity bits.
+        M += alignTo(ProfRecord.BitmapBytes.size(), sizeof(uint64_t));
+        M += sizeof(uint64_t); // The size of the UniformityBits vector
+        M += alignTo(ProfRecord.UniformityBits.size(), sizeof(uint64_t));
+      }
 
       // Value data
       M += ValueProfData::getSize(ProfileData.second);
@@ -88,7 +98,8 @@ class InstrProfRecordWriterTrait {
     Out.write(K.data(), N);
   }
 
-  void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
+  void EmitData(raw_ostream &Out, key_type_ref K, data_type_ref V,
+                offset_type) {
     using namespace support;
 
     endian::Writer LE(Out, llvm::endianness::little);
@@ -105,8 +116,27 @@ class InstrProfRecordWriterTrait {
         LE.write<uint64_t>(I);
 
       LE.write<uint64_t>(ProfRecord.BitmapBytes.size());
-      for (uint64_t I : ProfRecord.BitmapBytes)
-        LE.write<uint64_t>(I);
+      if (WritePrevVersion) {
+        // Version 13: each bitmap byte stored as a uint64_t.
+        for (uint8_t I : ProfRecord.BitmapBytes)
+          LE.write<uint64_t>(I);
+      } else {
+        // Version 14+: bitmap bytes as uint8_t with padding.
+        for (uint8_t I : ProfRecord.BitmapBytes)
+          LE.write<uint8_t>(I);
+        for (size_t I = ProfRecord.BitmapBytes.size();
+             I < alignTo(ProfRecord.BitmapBytes.size(), sizeof(uint64_t)); ++I)
+          LE.write<uint8_t>(0);
+
+        // Write uniformity bits (AMDGPU offload profiling).
+        LE.write<uint64_t>(ProfRecord.UniformityBits.size());
+        for (uint8_t I : ProfRecord.UniformityBits)
+          LE.write<uint8_t>(I);
+        for (size_t I = ProfRecord.UniformityBits.size();
+             I < alignTo(ProfRecord.UniformityBits.size(), sizeof(uint64_t));
+             ++I)
+          LE.write<uint8_t>(0);
+      }
 
       // Write value data
       std::unique_ptr<ValueProfData> VDataPtr =
@@ -209,7 +239,7 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash,
       Dest.scale(Weight, 1, MapWarn);
   } else {
     // We're updating a function we've seen before.
-    Dest.merge(I, Weight, MapWarn);
+    Dest.merge(I, Weight, MapWarn, I.OffloadDeviceWaveSize);
   }
 
   Dest.sortValueData();
@@ -524,6 +554,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   InfoObj->SummaryBuilder = &ISB;
   InstrProfSummaryBuilder CSISB(ProfileSummaryBuilder::DefaultCutoffs);
   InfoObj->CSSummaryBuilder = &CSISB;
+  InfoObj->WritePrevVersion = WritePrevVersion;
 
   // Populate the hash table generator.
   SmallVector<std::pair<StringRef, const ProfilingData *>> OrderedData;
@@ -542,7 +573,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   // The WritePrevVersion handling will either need to be removed or updated
   // if the version is advanced beyond 12.
   static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
-                IndexedInstrProf::ProfVersion::Version13);
+                IndexedInstrProf::ProfVersion::Version14);
   if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
     Header.Version |= VARIANT_MASK_IR_PROF;
   if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index dabd495cddd49..c2fcfb4dfa6b9 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -33,12 +34,15 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -50,6 +54,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/TargetParser.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -160,6 +165,14 @@ cl::opt<bool> SpeculativeCounterPromotionToLoop(
              " update can be further/iteratively promoted into an acyclic "
              " region."));
 
+static cl::opt<unsigned> OffloadPGOSampling(
+    "offload-pgo-sampling",
+    cl::desc("Log2 of the sampling period for offload PGO instrumentation. "
+             "Only 1 in every 2^N blocks is instrumented. "
+             "0 = all blocks, 1 = 50%, 2 = 25%, 3 = 12.5% (default). "
+             "Higher values reduce overhead at the cost of sparser profiles."),
+    cl::init(3));
+
 cl::opt<bool> IterativeCounterPromotion(
     "iterative-counter-promotion", cl::init(true),
     cl::desc("Allow counter promotion across the whole loop nest."));
@@ -241,6 +254,20 @@ static bool profDataReferencedByCode(const Module &M) {
   return enablesValueProfiling(M);
 }
 
+// Extract CUID (Compilation Unit ID) from the module.
+// HIP/CUDA modules have a global variable __hip_cuid_<hash> that uniquely
+// identifies each translation unit. Returns empty string if not found.
+static std::string getCUIDFromModule(const Module &M) {
+  for (const GlobalVariable &GV : M.globals()) {
+    StringRef Name = GV.getName();
+    if (Name.starts_with("__hip_cuid_")) {
+      // Extract the hash suffix after "__hip_cuid_"
+      return Name.drop_front(strlen("__hip_cuid_")).str();
+    }
+  }
+  return "";
+}
+
 class InstrLowerer final {
 public:
   InstrLowerer(Module &M, const InstrProfOptions &Options,
@@ -265,7 +292,8 @@ class InstrLowerer final {
   struct PerFunctionProfileData {
     uint32_t NumValueSites[IPVK_Last + 1] = {};
     GlobalVariable *RegionCounters = nullptr;
-    GlobalVariable *DataVar = nullptr;
+    GlobalVariable *UniformCounters = nullptr; // For AMDGPU divergence tracking
+    GlobalValue *DataVar = nullptr;
     GlobalVariable *RegionBitmaps = nullptr;
     uint32_t NumBitmapBytes = 0;
 
@@ -287,11 +315,25 @@ class InstrLowerer final {
   GlobalVariable *NamesVar = nullptr;
   size_t NamesSize = 0;
 
+  StructType *ProfileDataTy = nullptr;
+  std::string CachedCUID; // CUID cached for consistent section naming
+
   // vector of counter load/store pairs to be register promoted.
   std::vector<LoadStorePair> PromotionCandidates;
 
   int64_t TotalCountersPromoted = 0;
 
+  // Per-function cache of invariant values for AMDGPU PGO instrumentation.
+  // Computed once at the function entry and reused across all instrumentation
+  // points to avoid redundant IR and help the optimizer.
+  struct AMDGPUPGOInvariants {
+    Value *Matched = nullptr;
+  };
+  DenseMap<Function *, AMDGPUPGOInvariants> AMDGPUInvariantsCache;
+
+  /// Emit invariant PGO values at the function entry block and cache them.
+  AMDGPUPGOInvariants &getOrCreateAMDGPUInvariants(Function *F);
+
   /// Lower instrumentation intrinsics in the function. Returns true if there
   /// any lowering.
   bool lowerIntrinsics(Function *F);
@@ -324,6 +366,9 @@ class InstrLowerer final {
   /// Replace instrprof.increment with an increment of the appropriate value.
   void lowerIncrement(InstrProfIncrementInst *Inc);
 
+  /// AMDGPU specific implementation of lowerIncrement.
+  void lowerIncrementAMDGPU(InstrProfIncrementInst *Inc);
+
   /// Force emitting of name vars for unused functions.
   void lowerCoverageData(GlobalVariable *CoverageNamesVar);
 
@@ -348,6 +393,10 @@ class InstrLowerer final {
   /// referring to them will also be created.
   GlobalVariable *getOrCreateRegionCounters(InstrProfCntrInstBase *Inc);
 
+  /// Get the uniform entry counters for AMDGPU divergence tracking.
+  /// These counters track how often blocks are entered with all lanes active.
+  GlobalVariable *getOrCreateUniformCounters(InstrProfCntrInstBase *Inc);
+
   /// Create the region counters.
   GlobalVariable *createRegionCounters(InstrProfCntrInstBase *Inc,
                                        StringRef Name,
@@ -407,6 +456,25 @@ class InstrLowerer final {
   /// Create a static initializer for our data, on platforms that need it,
   /// and for any profile output file that was specified.
   void emitInitialization();
+
+  /// For GPU targets: cache the CUID for consistent section naming.
+  void cacheGPUCUID();
+
+  /// Return the __llvm_profile_data struct type.
+  StructType *getProfileDataTy();
+
+  /// Create __llvm_offload_prf structure for GPU targets.
+  /// All sections use linker-defined __start_/__stop_ bounds.
+  void createProfileSectionSymbols();
+
+  /// Create HIP device variable registration for profile symbols
+  void createHIPDeviceVariableRegistration();
+
+  /// Create HIP dynamic module registration call
+  void createHIPDynamicModuleRegistration();
+
+  /// Create HIP dynamic module unregistration call
+  void createHIPDynamicModuleUnregistration();
 };
 
 ///
@@ -938,6 +1006,8 @@ bool InstrLowerer::lower() {
   if (!ContainsProfiling && !CoverageNamesVar)
     return MadeChange;
 
+  cacheGPUCUID();
+
   // We did not know how many value sites there would be inside
   // the instrumented function. This is counting the number of instrumented
   // target value sites to enter it as field in the profile data variable.
@@ -986,6 +1056,16 @@ bool InstrLowerer::lower() {
   emitNameData();
   emitVTableNames();
 
+  // Create start/stop symbols for device code profile sections
+  createProfileSectionSymbols();
+
+  // Create host shadow variables and registration calls for HIP device profile
+  // symbols
+  createHIPDeviceVariableRegistration();
+
+  createHIPDynamicModuleRegistration();
+  createHIPDynamicModuleUnregistration();
+
   // Emit runtime hook for the cases where the target does not unconditionally
   // require pulling in profile runtime, and coverage is enabled on code that is
   // not eliminated by the front-end, e.g. unused functions with internal
@@ -1045,7 +1125,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   assert(It != ProfileDataMap.end() && It->second.DataVar &&
          "value profiling detected in function with no counter increment");
 
-  GlobalVariable *DataVar = It->second.DataVar;
+  GlobalValue *DataVar = It->second.DataVar;
   uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
   uint64_t Index = Ind->getIndex()->getZExtValue();
   for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
@@ -1057,7 +1137,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   CallInst *Call = nullptr;
   auto *TLI = &GetTLI(*Ind->getFunction());
   auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      DataVar, PointerType::get(M.getContext(), 0));
+      cast<Constant>(DataVar), PointerType::get(M.getContext(), 0));
 
   // To support value profiling calls within Windows exception handlers, funclet
   // information contained within operand bundles needs to be copied over to
@@ -1107,6 +1187,8 @@ GlobalVariable *InstrLowerer::getOrCreateBiasVar(StringRef VarName) {
 }
 
 Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
+  // Note: For AMDGPU targets, lowerIncrementAMDGPU handles counter addressing
+  // directly. This function is called for non-AMDGPU targets.
   auto *Counters = getOrCreateRegionCounters(I);
   IRBuilder<> Builder(I);
 
@@ -1189,6 +1271,10 @@ void InstrLowerer::lowerTimestamp(
 }
 
 void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
+  if (TT.isAMDGPU()) {
+    lowerIncrementAMDGPU(Inc);
+    return;
+  }
   auto *Addr = getCounterAddress(Inc);
 
   IRBuilder<> Builder(Inc);
@@ -1218,6 +1304,123 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   Inc->eraseFromParent();
 }
 
+// Determine the wavefront size for an AMDGPU function.
+// Checks target-features attribute first (+wavefrontsize32/+wavefrontsize64),
+// then falls back to the default wavefront size for the target-cpu.
+// Returns 32 or 64. Defaults to 32 if undetermined.
+static unsigned getAMDGPUWavefrontSize(const Function &F) {
+  // Check target-features attribute for explicit wavefront size
+  StringRef Features = F.getFnAttribute("target-features").getValueAsString();
+  if (Features.contains("+wavefrontsize64"))
+    return 64;
+  if (Features.contains("+wavefrontsize32"))
+    return 32;
+
+  // Fall back to default wavefront size based on target-cpu
+  StringRef CPU = F.getFnAttribute("target-cpu").getValueAsString();
+  if (!CPU.empty()) {
+    AMDGPU::GPUKind Kind = AMDGPU::parseArchAMDGCN(CPU);
+    unsigned Features = AMDGPU::getArchAttrAMDGCN(Kind);
+    if (Features & AMDGPU::FEATURE_WAVE32)
+      return 32;
+    return 64; // gfx9 and older default to Wave64
+  }
+
+  return 32; // conservative default
+}
+
+InstrLowerer::AMDGPUPGOInvariants &
+InstrLowerer::getOrCreateAMDGPUInvariants(Function *F) {
+  auto It = AMDGPUInvariantsCache.find(F);
+  if (It != AMDGPUInvariantsCache.end())
+    return It->second;
+
+  LLVMContext &Context = M.getContext();
+  auto *Int32Ty = Type::getInt32Ty(Context);
+
+  BasicBlock &EntryBB = F->getEntryBlock();
+  IRBuilder<> Builder(&*EntryBB.getFirstInsertionPt());
+
+  Value *Matched = ConstantInt::getTrue(Context);
+  if (OffloadPGOSampling > 0) {
+    FunctionCallee IsSampledFn =
+        M.getOrInsertFunction("__llvm_profile_sampling_gpu", Int32Ty, Int32Ty);
+    Value *SampledInt = Builder.CreateCall(
+        IsSampledFn, {ConstantInt::get(Int32Ty, OffloadPGOSampling)},
+        "pgo.sampled");
+    Matched = Builder.CreateICmpNE(SampledInt, ConstantInt::get(Int32Ty, 0),
+                                   "pgo.matched");
+  }
+
+  auto &Inv = AMDGPUInvariantsCache[F];
+  Inv.Matched = Matched;
+  return Inv;
+}
+
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+  Function *F = Inc->getFunction();
+  auto &Inv = getOrCreateAMDGPUInvariants(F);
+
+  IRBuilder<> Builder(Inc);
+  LLVMContext &Context = M.getContext();
+  auto *Int64Ty = Type::getInt64Ty(Context);
+
+  Value *Matched = Inv.Matched;
+
+  auto *CounterIdx = Inc->getIndex();
+
+  // --- Counter address ---
+  GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+  Value *Indices[] = {Builder.getInt32(0), CounterIdx};
+  Value *Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+                                          Indices, "ctr.addr");
+
+  GlobalVariable *UniformCounters = getOrCreateUniformCounters(Inc);
+  Value *UniformAddr = nullptr;
+  if (UniformCounters) {
+    Value *UniformIndices[] = {Builder.getInt32(0), CounterIdx};
+    UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+                                            UniformCounters, UniformIndices,
+                                            "unifctr.addr");
+  }
+
+  auto *PtrTy = PointerType::getUnqual(Context);
+  Value *UniformAddrArg =
+      UniformAddr
+          ? Builder.CreatePointerBitCastOrAddrSpaceCast(UniformAddr, PtrTy)
+          : ConstantPointerNull::get(cast<PointerType>(PtrTy));
+  Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
+
+  Value *IncStep = Inc->getStep();
+  Value *StepI64 = Builder.CreateZExtOrTrunc(IncStep, Int64Ty, "step.i64");
+
+  auto *CalleeTy = FunctionType::get(Type::getVoidTy(Context),
+                                     {PtrTy, PtrTy, Int64Ty}, false);
+  FunctionCallee IncrFn =
+      M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
+
+  if (OffloadPGOSampling > 0) {
+    BasicBlock *CurBB = Builder.GetInsertBlock();
+    BasicBlock *ContBB =
+        CurBB->splitBasicBlock(BasicBlock::iterator(Inc), "po_cont");
+    BasicBlock *ThenBB = BasicBlock::Create(Context, "po_then", F);
+
+    CurBB->getTerminator()->eraseFromParent();
+    IRBuilder<> HeadBuilder(CurBB);
+    HeadBuilder.CreateCondBr(Matched, ThenBB, ContBB);
+
+    IRBuilder<> ThenBuilder(ThenBB);
+    ThenBuilder.CreateCall(IncrFn, {CastAddr, UniformAddrArg, StepI64});
+    ThenBuilder.CreateBr(ContBB);
+
+    Builder.SetInsertPoint(ContBB, ContBB->begin());
+  } else {
+    Builder.CreateCall(IncrFn, {CastAddr, UniformAddrArg, StepI64});
+  }
+
+  Inc->eraseFromParent();
+}
+
 void InstrLowerer::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
   ConstantArray *Names =
       cast<ConstantArray>(CoverageNamesVar->getInitializer());
@@ -1400,6 +1603,12 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) {
   if (shouldUsePublicSymbol(Fn))
     return Fn;
 
+  // For GPU targets, weak functions cannot use private aliases because
+  // LTO may pick a different TU's copy, leaving the alias undefined
+  if (isGPUProfTarget(*Fn->getParent()) &&
+      GlobalValue::isWeakForLinker(Fn->getLinkage()))
+    return Fn;
+
   // When possible use a private alias to avoid symbolic relocations.
   auto *GA = GlobalAlias::create(GlobalValue::LinkageTypes::PrivateLinkage,
                                  Fn->getName() + ".local", Fn);
@@ -1623,11 +1832,15 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
   }
 
   Ptr->setVisibility(Visibility);
-  // Put the counters and bitmaps in their own sections so linkers can
-  // remove unneeded sections.
   Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat()));
   Ptr->setLinkage(Linkage);
-  maybeSetComdat(Ptr, Fn, VarName);
+  if (isGPUProfTarget(M) && !Ptr->hasComdat()) {
+    Ptr->setComdat(M.getOrInsertComdat(VarName));
+    Ptr->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    Ptr->setVisibility(GlobalValue::ProtectedVisibility);
+  } else {
+    maybeSetComdat(Ptr, Fn, VarName);
+  }
   return Ptr;
 }
 
@@ -1731,12 +1944,54 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
     CompilerUsedVars.push_back(PD.RegionCounters);
   }
 
+  // Create uniform counters before the data variable so that
+  // UniformCounterPtr can reference them in createDataVariable().
+  getOrCreateUniformCounters(Inc);
+
   // Create the data variable (if it doesn't already exist).
   createDataVariable(Inc);
 
   return PD.RegionCounters;
 }
 
+GlobalVariable *
+InstrLowerer::getOrCreateUniformCounters(InstrProfCntrInstBase *Inc) {
+  // Return (creating on first use) the per-function "uniform" counter array
+  // used by uniformity-aware offload profiling. It mirrors the regular
+  // region-counter array element-for-element; slot I presumably tracks
+  // uniform (whole-wave) executions of counter I via
+  // __llvm_profile_instrument_gpu. Only created for AMDGPU targets;
+  // returns nullptr otherwise.
+  if (!TT.isAMDGPU())
+    return nullptr;
+
+  GlobalVariable *NamePtr = Inc->getName();
+  auto &PD = ProfileDataMap[NamePtr];
+  if (PD.UniformCounters)
+    return PD.UniformCounters;
+
+  // Ensure RegionCounters exists first (we need the same size). This is
+  // mutually recursive with getOrCreateRegionCounters(); termination relies
+  // on each side early-returning its cached variable.
+  // NOTE(review): getOrCreateRegionCounters' early-return is outside this
+  // view - confirm it fires before it calls back into this function.
+  getOrCreateRegionCounters(Inc);
+
+  uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+
+  auto &Ctx = M.getContext();
+  // One i64 slot per counter, matching the layout of __profc_<fn>.
+  auto *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+  bool Renamed;
+  std::string VarName = getVarName(Inc, "__llvm_prf_unifcnt_", Renamed);
+
+  auto *GV = new GlobalVariable(M, CounterTy, false, NamePtr->getLinkage(),
+                                Constant::getNullValue(CounterTy), VarName);
+  GV->setAlignment(Align(8));
+
+  // Dedicated __llvm_prf_ucnts section so the runtime can find all uniform
+  // counters between the section's start/stop bounds.
+  GV->setSection(getInstrProfSectionName(IPSK_ucnts, TT.getObjectFormat()));
+
+  // linkonce_odr + comdat dedups identical copies across TUs at device link
+  // time; protected visibility keeps the definition non-preemptible.
+  GV->setComdat(M.getOrInsertComdat(VarName));
+  GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
+  GV->setVisibility(GlobalValue::ProtectedVisibility);
+
+  PD.UniformCounters = GV;
+  // Keep the array alive under linker GC even if no code references it.
+  CompilerUsedVars.push_back(GV);
+
+  return PD.UniformCounters;
+}
+
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
@@ -1799,7 +2054,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   }
 
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
-  auto *CounterPtr = PD.RegionCounters;
+
+  Constant *CounterPtr = PD.RegionCounters;
+  Constant *UniformCounterPtr = PD.UniformCounters;
 
   uint64_t NumBitmapBytes = PD.NumBitmapBytes;
 
@@ -1807,11 +2064,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
   auto *Int16Ty = Type::getInt16Ty(Ctx);
   auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
-  Type *DataTypes[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
-  };
-  auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  auto *DataTy = getProfileDataTy();
 
   Constant *FunctionAddr = getFuncAddrForProfData(Fn);
 
@@ -1819,6 +2072,19 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
+  uint16_t OffloadDeviceWaveSizeVal = 0;
+  if (TT.isAMDGPU())
+    OffloadDeviceWaveSizeVal = getAMDGPUWavefrontSize(*Fn);
+
+  if (isGPUProfTarget(M)) {
+    // For GPU targets, weak functions need weak linkage for their profile data
+    // aliases to allow linker deduplication across TUs
+    if (GlobalValue::isWeakForLinker(Fn->getLinkage()))
+      Linkage = Fn->getLinkage();
+    else
+      Linkage = GlobalValue::ExternalLinkage;
+    Visibility = GlobalValue::ProtectedVisibility;
+  }
   // If the data variable is not referenced by code (if we don't emit
   // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
   // data variable live under linker GC, the data variable can be private. This
@@ -1830,7 +2096,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
   // that other copies must have the same CFG and cannot have value profiling.
   // If no hash suffix, other profd copies may be referenced by code.
-  if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+  if (!isGPUProfTarget(M) && NS == 0 &&
+      !(DataReferencedByCode && NeedComdat && !Renamed) &&
       (TT.isOSBinFormatELF() ||
        (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
     Linkage = GlobalValue::PrivateLinkage;
@@ -1843,7 +2110,11 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     Visibility = GlobalValue::ProtectedVisibility;
   auto *Data =
       new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName);
+  GlobalValue *DataVar = Data;
+  Constant *DataAddr = Data;
+
   Constant *RelativeCounterPtr;
+  Constant *RelativeUniformCounterPtr = ConstantInt::get(IntPtrTy, 0);
   GlobalVariable *BitmapPtr = PD.RegionBitmaps;
   Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0);
   InstrProfSectKind DataSectionKind;
@@ -1854,10 +2125,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy);
     if (BitmapPtr != nullptr)
       RelativeBitmapPtr = ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy);
+    if (UniformCounterPtr != nullptr)
+      RelativeUniformCounterPtr =
+          ConstantExpr::getPtrToInt(UniformCounterPtr, IntPtrTy);
   } else if (TT.isNVPTX()) {
-    // The NVPTX target cannot handle self-referencing constant expressions in
-    // global initializers at all. Use absolute pointers and have the runtime
-    // registration convert them to relative offsets.
     DataSectionKind = IPSK_data;
     RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy);
   } else {
@@ -1866,29 +2137,40 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     DataSectionKind = IPSK_data;
     RelativeCounterPtr =
         ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
-                             ConstantExpr::getPtrToInt(Data, IntPtrTy));
+                             ConstantExpr::getPtrToInt(DataAddr, IntPtrTy));
     if (BitmapPtr != nullptr)
       RelativeBitmapPtr =
           ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
-                               ConstantExpr::getPtrToInt(Data, IntPtrTy));
+                               ConstantExpr::getPtrToInt(DataAddr, IntPtrTy));
+    if (UniformCounterPtr != nullptr)
+      RelativeUniformCounterPtr = ConstantExpr::getSub(
+          ConstantExpr::getPtrToInt(UniformCounterPtr, IntPtrTy),
+          ConstantExpr::getPtrToInt(DataAddr, IntPtrTy));
   }
 
   Constant *DataVals[] = {
 #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
 #include "llvm/ProfileData/InstrProfData.inc"
   };
-  Data->setInitializer(ConstantStruct::get(DataTy, DataVals));
+  auto *DataInit = ConstantStruct::get(DataTy, DataVals);
 
-  Data->setVisibility(Visibility);
-  Data->setSection(
+  auto *DataGV = cast<GlobalVariable>(DataVar);
+  DataGV->setInitializer(DataInit);
+  DataGV->setVisibility(Visibility);
+  DataGV->setSection(
       getInstrProfSectionName(DataSectionKind, TT.getObjectFormat()));
-  Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
-  maybeSetComdat(Data, Fn, CntsVarName);
+  DataGV->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
+  if (isGPUProfTarget(M) && !DataGV->hasComdat()) {
+    DataGV->setComdat(M.getOrInsertComdat(CntsVarName));
+    DataGV->setLinkage(GlobalValue::LinkOnceODRLinkage);
+  } else {
+    maybeSetComdat(DataGV, Fn, CntsVarName);
+  }
 
-  PD.DataVar = Data;
+  PD.DataVar = DataVar;
 
   // Mark the data variable as used so that it isn't stripped out.
-  CompilerUsedVars.push_back(Data);
+  CompilerUsedVars.push_back(DataVar);
   // Now that the linkage set by the FE has been passed to the data and counter
   // variables, reset Name variable's linkage and visibility to private so that
   // it can be removed later by the compiler.
@@ -1948,6 +2230,102 @@ void InstrLowerer::emitVNodes() {
   UsedVars.push_back(VNodesVar);
 }
 
+void InstrLowerer::createHIPDynamicModuleRegistration() {
+  // Host-side only: after every hipModuleLoad* call site, insert a call to
+  // __llvm_profile_offload_register_dynamic_module so the profile runtime is
+  // told about code objects loaded at run time (as opposed to fat-binary
+  // modules registered through __hip_module_ctor).
+  if (isGPUProfTarget(M))
+    return;
+  StringRef FuncNames[] = {"hipModuleLoad", "hipModuleLoadData",
+                           "hipModuleLoadDataEx"};
+  for (StringRef FuncName : FuncNames) {
+    Function *F = M.getFunction(FuncName);
+    if (!F)
+      continue;
+
+    // Inserting calls does not add users of F, so iterating users() while
+    // instrumenting each call site is safe.
+    for (User *U : F->users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        Instruction *InsertPt = nullptr;
+        // If the call is an invoke instruction, we should insert the
+        // registration call in the normal destination block.
+        if (auto *Invoke = dyn_cast<InvokeInst>(CB)) {
+          InsertPt = &*Invoke->getNormalDest()->getFirstInsertionPt();
+        } else if (CB->isTerminator()) {
+          // If it's another kind of terminator (e.g., callbr), we don't
+          // know the semantics of the successors, so we conservatively
+          // skip it. The hipModuleLoad* functions are not expected to be
+          // used in other terminator instructions.
+          continue;
+        } else {
+          // This is a normal call instruction, so we can insert after it.
+          InsertPt = CB->getNextNode();
+        }
+
+        // If there's no valid insertion point (e.g., a malformed block),
+        // skip.
+        if (!InsertPt)
+          continue;
+
+        IRBuilder<> Builder(InsertPt);
+        auto *VoidTy = Type::getVoidTy(M.getContext());
+        auto *VoidPtrTy = PointerType::getUnqual(M.getContext());
+        auto *Int32Ty = Type::getInt32Ty(M.getContext());
+        // register(int rc, void **modulePtr, const void *image)
+        auto *RegisterDynamicModuleTy =
+            FunctionType::get(VoidTy, {Int32Ty, VoidPtrTy, VoidPtrTy}, false);
+        FunctionCallee RegisterFunc = M.getOrInsertFunction(
+            "__llvm_profile_offload_register_dynamic_module",
+            RegisterDynamicModuleTy);
+
+        // Arg 0: return value of the hipModuleLoad* call (hipError_t / i32).
+        Value *ReturnValue = CB;
+        // Arg 1: module handle (out-parameter, hipModule_t*).
+        Value *ModuleHandle = CB->getArgOperand(0);
+        // Arg 2: code object image pointer.
+        // For hipModuleLoadData(module, image) and
+        // hipModuleLoadDataEx(module, image, ...), image is arg 1.
+        // For hipModuleLoad(module, fname), arg 1 is a filename, so pass
+        // NULL; the runtime must handle a null image for this case.
+        Value *ImagePtr;
+        if (FuncName == "hipModuleLoad")
+          ImagePtr =
+              ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
+        else
+          ImagePtr = CB->getArgOperand(1);
+
+        Builder.CreateCall(RegisterFunc, {ReturnValue, ModuleHandle, ImagePtr});
+      }
+    }
+  }
+}
+
+void InstrLowerer::createHIPDynamicModuleUnregistration() {
+  // Host-side counterpart to createHIPDynamicModuleRegistration(): before
+  // each hipModuleUnload call, tell the profile runtime to drop its state
+  // for the module being unloaded (while the handle is still valid).
+  //
+  // Guard on device compilations for consistency with the registration path;
+  // hipModuleUnload should not exist in device modules anyway.
+  if (isGPUProfTarget(M))
+    return;
+
+  Function *F = M.getFunction("hipModuleUnload");
+  if (!F)
+    return;
+
+  for (User *U : F->users()) {
+    // A user of a function in users() is never null; plain dyn_cast matches
+    // the registration path above.
+    if (auto *CB = dyn_cast<CallBase>(U)) {
+      // The insertion point is right before the call to hipModuleUnload.
+      Instruction *InsertPt = CB;
+
+      IRBuilder<> Builder(InsertPt);
+      auto *VoidTy = Type::getVoidTy(M.getContext());
+      auto *VoidPtrTy = PointerType::getUnqual(M.getContext());
+
+      auto *UnregisterDynamicModuleTy =
+          FunctionType::get(VoidTy, {VoidPtrTy}, false);
+      FunctionCallee UnregisterFunc = M.getOrInsertFunction(
+          "__llvm_profile_offload_unregister_dynamic_module",
+          UnregisterDynamicModuleTy);
+
+      // The argument is the module handle, which is the first
+      // argument to the hipModuleUnload call.
+      Value *ModuleHandle = CB->getArgOperand(0);
+      Value *CastedModuleHandle =
+          Builder.CreatePointerCast(ModuleHandle, VoidPtrTy);
+
+      Builder.CreateCall(UnregisterFunc, {CastedModuleHandle});
+    }
+  }
+}
+
 void InstrLowerer::emitNameData() {
   if (ReferencedNames.empty())
     return;
@@ -1961,16 +2339,23 @@ void InstrLowerer::emitNameData() {
   auto &Ctx = M.getContext();
   auto *NamesVal =
       ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false);
-  NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
-                                GlobalValue::PrivateLinkage, NamesVal,
-                                getInstrProfNamesVarName());
+  std::string NamesVarName = std::string(getInstrProfNamesVarName());
+  if (isGPUProfTarget(M)) {
+    std::string CUID = CachedCUID.empty() ? getCUIDFromModule(M) : CachedCUID;
+    if (!CUID.empty())
+      NamesVarName = NamesVarName + "_" + CUID;
+  }
+  NamesVar =
+      new GlobalVariable(M, NamesVal->getType(), true,
+                         GlobalValue::PrivateLinkage, NamesVal, NamesVarName);
 
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
-  NamesVar->setSection(
+  std::string NamesSectionName =
       ProfileCorrelate == InstrProfCorrelator::BINARY
           ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat())
-          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat());
+  NamesVar->setSection(NamesSectionName);
   // On COFF, it's important to reduce the alignment down to 1 to prevent the
   // linker from inserting padding before the start of the names section or
   // between names entries.
@@ -2179,3 +2564,187 @@ void createProfileSamplingVar(Module &M) {
   appendToCompilerUsed(M, SamplingVar);
 }
 } // namespace llvm
+
+namespace {
+
+// GPU-lowering helpers: cache the per-module profile-data struct type and
+// emit CUID-keyed section/registration symbols so the host runtime can find
+// device profile sections regardless of linker symbol ordering.
+StructType *InstrLowerer::getProfileDataTy() {
+  // Lazily build (and cache per module) the LLVM struct type of one
+  // __llvm_prf_data record, as described by the INSTR_PROF_DATA x-macro.
+  if (ProfileDataTy)
+    return ProfileDataTy;
+
+  auto &Ctx = M.getContext();
+  // These locals look unused but are referenced by the LLVMType expressions
+  // expanded from InstrProfData.inc below - do not remove them.
+  auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  auto *Int16Ty = Type::getInt16Ty(Ctx);
+  auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+  Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+  };
+  ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  return ProfileDataTy;
+}
+
+void InstrLowerer::cacheGPUCUID() {
+  // Cache the compilation-unit ID once per GPU module so the later lowering
+  // steps (name-variable suffixing, section-symbol creation) all agree on
+  // the same CUID value. No-op for non-GPU profile targets.
+  if (!isGPUProfTarget(M))
+    return;
+  CachedCUID = getCUIDFromModule(M);
+}
+
+// Create CUID-suffixed pointer to __llvm_profile_sections for GPU targets.
+// The actual sections struct is defined in InstrProfilingPlatformGPU.c
+// (device-side runtime) using linker-defined __start_/__stop_ symbols.
+// We create a per-TU global that points to it, giving the host a unique
+// symbol for shadow variable registration.
+void InstrLowerer::createProfileSectionSymbols() {
+  // Emit the per-TU, CUID-suffixed device global __llvm_offload_prf_<CUID>
+  // pointing at the runtime-defined __llvm_profile_sections table, so the
+  // host has a unique symbol per TU to resolve. Requires a GPU profile
+  // target and a known CUID.
+  if (!isGPUProfTarget(M) || CachedCUID.empty())
+    return;
+
+  auto &Ctx = M.getContext();
+  unsigned AS = M.getDataLayout().getDefaultGlobalsAddressSpace();
+  auto *Int8PtrTy = PointerType::get(Ctx, AS);
+
+  // __llvm_profile_sections is an array of 9 pointers defined in the GPU
+  // profile runtime (InstrProfilingPlatformGPU.c). Declare it as external.
+  // NOTE(review): the hard-coded arity 9 must stay in sync with the
+  // runtime's definition - consider a shared named constant.
+  auto *SectionsTy = ArrayType::get(Int8PtrTy, 9);
+  auto *SectionsGV = M.getGlobalVariable("__llvm_profile_sections");
+  if (!SectionsGV) {
+    SectionsGV = new GlobalVariable(M, SectionsTy, /*isConstant=*/true,
+                                    GlobalValue::ExternalLinkage, nullptr,
+                                    "__llvm_profile_sections", nullptr,
+                                    GlobalValue::NotThreadLocal, AS);
+    SectionsGV->setVisibility(GlobalValue::HiddenVisibility);
+  }
+
+  // Create a CUID-suffixed global that stores a pointer to the sections
+  // struct. Aliases can't point to declarations, so we use a pointer global.
+  // The host reads through this indirection: hipGetSymbolAddress gives the
+  // pointer global's device address, then one DtoH copy yields the sections
+  // struct address, then another DtoH copy reads the actual sections.
+  auto *PtrTy = PointerType::get(Ctx, AS);
+  auto *PtrInit =
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(SectionsGV, PtrTy);
+  std::string PtrName = "__llvm_offload_prf_" + CachedCUID;
+  auto *PtrGV = new GlobalVariable(
+      M, PtrTy, /*isConstant=*/true, GlobalValue::ExternalLinkage, PtrInit,
+      PtrName, nullptr, GlobalValue::NotThreadLocal, AS);
+  // Default (not protected) visibility: the host runtime looks this symbol
+  // up by name.
+  PtrGV->setVisibility(GlobalValue::DefaultVisibility);
+  // Keep it alive under linker GC even though no device code references it.
+  CompilerUsedVars.push_back(PtrGV);
+}
+
+void InstrLowerer::createHIPDeviceVariableRegistration() {
+  // Host-side only: create and register the per-TU shadow variable
+  // __llvm_offload_prf_<CUID> so the HIP runtime can map it to the
+  // device-side pointer emitted by createProfileSectionSymbols().
+  if (isGPUProfTarget(M))
+    return;
+
+  std::string CUID = getCUIDFromModule(M);
+  if (CUID.empty())
+    return;
+
+  auto &Ctx = M.getContext();
+  auto *VoidTy = Type::getVoidTy(Ctx);
+  auto *VoidPtrTy = PointerType::getUnqual(Ctx);
+
+  // Host shadow of the device pointer global; bound by __hipRegisterVar (or
+  // the linker wrapper in RDC mode) below.
+  std::string OffloadPrfName = "__llvm_offload_prf_" + CUID;
+  auto *OffloadPrfShadow = new GlobalVariable(
+      M, VoidPtrTy, /*isConstant=*/false, GlobalValue::ExternalLinkage,
+      ConstantPointerNull::get(cast<PointerType>(VoidPtrTy)), OffloadPrfName);
+  CompilerUsedVars.push_back(OffloadPrfShadow);
+
+  auto *RegisterShadowTy = FunctionType::get(VoidTy, {VoidPtrTy}, false);
+  FunctionCallee RegisterShadowFunc = M.getOrInsertFunction(
+      "__llvm_profile_offload_register_shadow_variable", RegisterShadowTy);
+
+  Function *Ctor = M.getFunction("__hip_module_ctor");
+  if (!Ctor) {
+    // RDC mode: no __hip_module_ctor per-TU. Emit an offloading entry so the
+    // linker wrapper generates __hipRegisterVar in the final module ctor.
+    llvm::offloading::emitOffloadingEntry(
+        M, llvm::object::OffloadKind::OFK_HIP, OffloadPrfShadow, OffloadPrfName,
+        M.getDataLayout().getPointerSize(),
+        llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
+
+    // Still register the shadow with the profile runtime from a late ctor.
+    auto *CtorFn = Function::Create(FunctionType::get(VoidTy, false),
+                                    GlobalValue::InternalLinkage,
+                                    "__llvm_pgo_register_" + CUID, &M);
+    auto *Entry = BasicBlock::Create(Ctx, "entry", CtorFn);
+    IRBuilder<> B(Entry);
+    B.CreateCall(RegisterShadowFunc, {OffloadPrfShadow});
+    B.CreateRetVoid();
+    appendToGlobalCtors(M, CtorFn, 65535);
+    return;
+  }
+
+  // Locate the HIP fat-binary registration call and capture its return value.
+  // The !Handle guard makes the first call win: the inner `break` only exits
+  // the instruction loop, so without the guard a later call in another block
+  // would silently overwrite the handle.
+  Value *Handle = nullptr;
+  for (BasicBlock &BB : *Ctor)
+    for (Instruction &I : BB)
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (Function *Callee = CB->getCalledFunction())
+          if (!Handle && Callee->getName() == "__hipRegisterFatBinary") {
+            Handle = &I; // call result
+            break;
+          }
+  if (!Handle)
+    return;
+  // Find the global the ctor stores the fatbin handle into so we can reload
+  // it at our insertion point instead of reusing the SSA value directly.
+  // NOTE(review): getNextNode() only walks the call's own basic block; if
+  // the store lives in a later block we fall back to the raw call result.
+  GlobalVariable *FatbinHandleGV = nullptr;
+  if (auto *HandleInst = dyn_cast<Instruction>(Handle))
+    for (Instruction *Cur = HandleInst->getNextNode(); Cur;
+         Cur = Cur->getNextNode()) {
+      auto *SI = dyn_cast<StoreInst>(Cur);
+      if (!SI || SI->getValueOperand() != Handle)
+        continue;
+      if (auto *GV = dyn_cast<GlobalVariable>(
+              SI->getPointerOperand()->stripPointerCasts())) {
+        FatbinHandleGV = GV;
+        break;
+      }
+    }
+
+  if (!FatbinHandleGV) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "store of __hipRegisterFatBinary call not found\n");
+  }
+
+  // Insert the new registration just before the ctor's return.
+  ReturnInst *RetInst = nullptr;
+  for (auto &BB : llvm::reverse(*Ctor))
+    if ((RetInst = dyn_cast<ReturnInst>(BB.getTerminator())))
+      break;
+  if (!RetInst)
+    return;
+  IRBuilder<> Builder(RetInst);
+
+  LLVM_DEBUG(
+      llvm::dbgs() << "Found __hip_module_ctor, registering anchors for CUID="
+                   << CUID << "\n");
+
+  auto *Int32Ty = Type::getInt32Ty(Ctx);
+  auto *Int64Ty = Type::getInt64Ty(Ctx);
+  // Signature mirrors clang's __hipRegisterVar emission:
+  // (handle, hostVar, deviceName, hostName, ext, size, constant, global).
+  // NOTE(review): confirm against CGCUDANV's current ABI.
+  auto *RegisterVarTy =
+      FunctionType::get(VoidTy,
+                        {VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, Int32Ty,
+                         Int64Ty, Int32Ty, Int32Ty},
+                        false);
+  FunctionCallee RegisterVarFunc =
+      M.getOrInsertFunction("__hipRegisterVar", RegisterVarTy);
+  Value *HipHandle =
+      FatbinHandleGV ? Builder.CreateLoad(VoidPtrTy, FatbinHandleGV) : Handle;
+
+  auto *NameStr = ConstantDataArray::getString(Ctx, OffloadPrfName, true);
+  auto *NameGV = new GlobalVariable(M, NameStr->getType(), true,
+                                    GlobalValue::PrivateLinkage, NameStr,
+                                    OffloadPrfName + ".name");
+
+  Builder.CreateCall(RegisterVarFunc,
+                     {HipHandle, OffloadPrfShadow,
+                      Builder.CreatePointerCast(NameGV, VoidPtrTy),
+                      Builder.CreatePointerCast(NameGV, VoidPtrTy),
+                      Builder.getInt32(0),
+                      Builder.getInt64(M.getDataLayout().getPointerSize()),
+                      Builder.getInt32(0), Builder.getInt32(0)});
+
+  Builder.CreateCall(RegisterShadowFunc, {OffloadPrfShadow});
+}
+
+} // namespace
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index db032d6fcad45..3e8096d28817c 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -118,6 +118,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <cstdlib>
 #include <memory>
 #include <numeric>
 #include <optional>
@@ -379,8 +380,15 @@ class FunctionInstrumenter final {
   // values. Supporting other values is relatively straight-forward - just
   // another counter range within the context.
   bool isValueProfilingDisabled() const {
+    // Value profiling is disabled for GPU targets because the device-side
+    // profiling runtime does not yet implement
+    // __llvm_profile_instrument_target. The existing compiler-rt implementation
+    // uses a linked-list with locks and eviction policy that is not efficient
+    // for massively parallel GPU execution. A GPU-optimized implementation is
+    // left as future work.
     return DisableValueProfiling ||
-           InstrumentationType == PGOInstrumentationType::CTXPROF;
+           InstrumentationType == PGOInstrumentationType::CTXPROF ||
+           isGPUProfTarget(M);
   }
 
   bool shouldInstrumentEntryBB() const {
@@ -1198,6 +1206,9 @@ class PGOUseFunc {
   // Annotate the irreducible loop header weights.
   void annotateIrrLoopHeaderWeights();
 
+  // Annotate per-block uniformity info for offload profiling.
+  void setBlockUniformityAttribute();
+
   // The hotness of the function from the profile count.
   enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
 
@@ -1304,11 +1315,34 @@ bool PGOUseFunc::setInstrumentedCounts(
 
   setupBBInfoEdges(FuncInfo);
 
-  unsigned NumCounters =
-      InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+  unsigned NumInstrumentedBBs = InstrumentBBs.size();
+  unsigned NumSelects = FuncInfo.SIVisitor.getNumOfSelectInsts();
+  unsigned NumCounters = NumInstrumentedBBs + NumSelects;
   // The number of counters here should match the number of counters
   // in profile. Return if they mismatch.
   if (NumCounters != CountFromProfile.size()) {
+    LLVM_DEBUG({
+      dbgs() << "PGO COUNTER MISMATCH for function " << F.getName() << ":\n";
+      dbgs() << "  Expected counters: " << NumCounters << "\n";
+      dbgs() << "    - From instrumented edges: " << NumInstrumentedBBs << "\n";
+      for (size_t i = 0; i < InstrumentBBs.size(); ++i) {
+        dbgs() << "      " << i << ": " << InstrumentBBs[i]->getName() << "\n";
+      }
+      dbgs() << "    - From select instructions: " << NumSelects << "\n";
+      dbgs() << "  Actual counters from profile: " << CountFromProfile.size()
+             << "\n";
+
+      // Dump module
+      std::error_code EC;
+      std::string Filename = "pgo_mismatch_" + F.getName().str() + ".ll";
+      raw_fd_ostream OS(Filename, EC);
+      if (!EC) {
+        dbgs() << "Dumping module to " << Filename << "\n";
+        M->print(OS, nullptr);
+      } else {
+        dbgs() << "Error opening file " << Filename << " for writing\n";
+      }
+    });
     return false;
   }
   auto *FuncEntry = &*F.begin();
@@ -1316,6 +1350,7 @@ bool PGOUseFunc::setInstrumentedCounts(
   // Set the profile count to the Instrumented BBs.
   uint32_t I = 0;
   for (BasicBlock *InstrBB : InstrumentBBs) {
+
     uint64_t CountValue = CountFromProfile[I++];
     PGOUseBBInfo &Info = getBBInfo(InstrBB);
     // If we reach here, we know that we have some nonzero count
@@ -1761,6 +1796,46 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() {
   }
 }
 
+void PGOUseFunc::setBlockUniformityAttribute() {
+  // Nothing to do for profiles without uniformity data (non-GPU profiles).
+  if (ProfileRecord.UniformityBits.empty())
+    return;
+
+  // Annotate uniformity on each instrumented IR basic block so later codegen
+  // passes (MachineFunction) can consume it without relying on fragile block
+  // numbering heuristics.
+  //
+  // Metadata kind: LLVMContext::MD_block_uniformity_profile
+  // Payload: i1 (true = uniform, false = divergent)
+  //
+  // NOTE(review): assumes UniformityBits indexing matches getInstrumentBBs()
+  // order and length - confirm isBlockUniform(I) is well-defined when the
+  // bit vector is shorter than the instrumented-block list.
+
+  std::vector<BasicBlock *> InstrumentBBs;
+  FuncInfo.getInstrumentBBs(InstrumentBBs);
+
+  LLVMContext &Ctx = F.getContext();
+  Type *Int1Ty = Type::getInt1Ty(Ctx);
+
+  for (size_t I = 0, E = InstrumentBBs.size(); I < E; ++I) {
+    BasicBlock *BB = InstrumentBBs[I];
+    // Skip degenerate blocks; metadata hangs off the terminator.
+    if (!BB || !BB->getTerminator())
+      continue;
+    bool IsUniform = ProfileRecord.isBlockUniform(I);
+    auto *MD = MDNode::get(
+        Ctx, ConstantAsMetadata::get(ConstantInt::get(Int1Ty, IsUniform)));
+    BB->getTerminator()->setMetadata(LLVMContext::MD_block_uniformity_profile,
+                                     MD);
+  }
+
+  // Keep a function attribute for debugging / IR inspection.
+  // Format: "U" for uniform, "D" for divergent, one per instrumented block.
+  std::string UniformityStr;
+  UniformityStr.reserve(InstrumentBBs.size());
+  for (size_t I = 0, E = InstrumentBBs.size(); I < E; ++I)
+    UniformityStr += ProfileRecord.isBlockUniform(I) ? 'U' : 'D';
+  F.addFnAttr("block-uniformity-profile", UniformityStr);
+
+  LLVM_DEBUG(dbgs() << "PGO: Set block uniformity profile for " << F.getName()
+                    << ": " << UniformityStr << "\n");
+}
+
 void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   Module *M = F.getParent();
   IRBuilder<> Builder(&SI);
@@ -2272,6 +2347,7 @@ static bool annotateAllFunctions(
     Func.setBranchWeights();
     Func.annotateValueSites();
     Func.annotateIrrLoopHeaderWeights();
+    Func.setBlockUniformityAttribute();
     PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
     if (FreqAttr == PGOUseFunc::FFA_Cold)
       ColdFunctions.push_back(&F);
@@ -2406,14 +2482,14 @@ void llvm::setProfMetadata(Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
                            uint64_t MaxCount) {
   auto Weights = downscaleWeights(EdgeCounts, MaxCount);
 
-  LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
-                                           : Weights) {
+  LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W : Weights) {
     dbgs() << W << " ";
-  } dbgs() << "\n";);
+  } dbgs() << "\n");
 
   misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
 
   setBranchWeights(*TI, Weights, /*IsExpected=*/false);
+
   if (EmitBranchProbability) {
     std::string BrCondStr = getBranchCondString(TI);
     if (BrCondStr.empty())
diff --git a/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll b/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll
index 7108d82ddaa2e..37e5295c91aee 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-profile-sections.ll
@@ -5,7 +5,7 @@ $__profc_begin = comdat nodeduplicate
 $__profc_end = comdat nodeduplicate
 
 @__profc_begin = private global [2 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
- at __profd_begin = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -1301828029439649651, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_begin to i64), i64 ptrtoint (ptr @__profd_begin to i64)), i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_begin), align 8
+ at __profd_begin = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 -1301828029439649651, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_begin to i64), i64 ptrtoint (ptr @__profd_begin to i64)), i64 0, i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc_begin), align 8
 @__profc_end = private global [2 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
- at __profd_end = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 3274037854792712831, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_end to i64), i64 ptrtoint (ptr @__profd_end to i64)), i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_end), align 8
+ at __profd_end = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 3274037854792712831, i64 172590168, i64 sub (i64 ptrtoint (ptr @__profc_end to i64), i64 ptrtoint (ptr @__profd_end to i64)), i64 0, i64 0, ptr null, ptr null, i32 2, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc_end), align 8
 
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-3d-grid.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-3d-grid.ll
new file mode 100644
index 0000000000000..43adb1c30df88
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-3d-grid.ll
@@ -0,0 +1,24 @@
+;; Test that AMDGPU PGO instrumentation generates per-function comdat counter
+;; arrays and a CUID-suffixed profile-section registration symbol. The
+;; __llvm_profile_sampling_gpu library function handles 3D block
+;; linearization internally, so no grid math appears in the emitted IR.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+ at __hip_cuid_abcdef789 = addrspace(1) global i8 0
+ at __profn_kernel_3d = private constant [9 x i8] c"kernel_3d"
+
+define amdgpu_kernel void @kernel_3d() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel_3d, i64 12345, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; Per-function comdat counters (3D grid linearization is handled in the runtime library)
+; CHECK: @__profc_kernel_3d = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__llvm_prf_unifcnt_kernel_3d = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__llvm_offload_prf_abcdef789 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
+
+;; The sampling guard and increment are emitted as device profile runtime calls
+; CHECK: call i32 @__llvm_profile_sampling_gpu(i32 3)
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) @__profc_kernel_3d to ptr), ptr addrspacecast (ptr addrspace(1) @__llvm_prf_unifcnt_kernel_3d to ptr), i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
new file mode 100644
index 0000000000000..8ca0a7947bc5d
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
@@ -0,0 +1,35 @@
+;; Test that AMDGPU targets use contiguous counter allocation with CUID-based naming.
+;; This avoids linker reordering issues where individual __profc_* symbols could be
+;; placed in any order within the section.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+;; Simulate a module with CUID (as generated by HIP compilation)
+ at __hip_cuid_abc123 = addrspace(1) global i8 0
+
+ at __profn_kernel1 = private constant [7 x i8] c"kernel1"
+ at __profn_kernel2 = private constant [7 x i8] c"kernel2"
+
+;; Per-kernel counter arrays: kernel1 has 2 slots, kernel2 has 1 (section "__llvm_prf_cnts")
+; CHECK: @__profc_kernel1 = linkonce_odr protected addrspace(1) global [2 x i64] zeroinitializer, section "__llvm_prf_cnts"
+; CHECK: @__llvm_prf_unifcnt_kernel1 = linkonce_odr protected addrspace(1) global [2 x i64] zeroinitializer, section "__llvm_prf_ucnts"
+; CHECK: @__profc_kernel2 = linkonce_odr protected addrspace(1) global [1 x i64] zeroinitializer, section "__llvm_prf_cnts"
+
+define amdgpu_kernel void @kernel1() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 0)
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 1)
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; Registration symbol points at external section table (Joseph #187136 layout)
+; CHECK: @__llvm_offload_prf_abc123 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
+
+;; Second counter slot uses GEP into the same [2 x i64] arrays
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) @__profc_kernel1, i32 0, i32 1) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) @__llvm_prf_unifcnt_kernel1, i32 0, i32 1) to ptr), i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
new file mode 100644
index 0000000000000..963e141934df1
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll
@@ -0,0 +1,57 @@
+;; Test AMDGPU PGO instrumentation lowering with multiple basic blocks.
+;; Verifies:
+;; 1. Sampling decision is computed once in the entry block.
+;; 2. Each instrumentation point calls __llvm_profile_instrument_gpu behind the
+;;    sampling guard branch.
+;; 3. No-sampling mode (sampling=0) calls __llvm_profile_instrument_gpu unconditionally.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof \
+; RUN:   -offload-pgo-sampling=3 -S \
+; RUN:   | FileCheck %s --check-prefix=SAMPLED
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof \
+; RUN:   -offload-pgo-sampling=0 -S \
+; RUN:   | FileCheck %s --check-prefix=NOSAMPLE
+
+ at __hip_cuid_test01 = addrspace(1) global i8 0
+ at __profn_test_kernel = private constant [11 x i8] c"test_kernel"
+
+define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out, i32 %n) {
+entry:
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 0)
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 1)
+  store i32 1, ptr addrspace(1) %out
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; ---- Sampled mode (sampling=3) ----
+
+;; Entry block: sampling decision computed once
+; SAMPLED-LABEL: define {{.*}} @test_kernel
+; SAMPLED: entry:
+; SAMPLED: %pgo.sampled = call i32 @__llvm_profile_sampling_gpu(i32 3)
+; SAMPLED: %pgo.matched = icmp ne i32 %pgo.sampled, 0
+; SAMPLED: br i1 %pgo.matched, label %po_then, label %po_cont
+
+;; Second instrumentation point reuses same sampling decision
+; SAMPLED: br i1 %pgo.matched, label %po_then{{[0-9]+}}, label %po_cont{{[0-9]+}}
+
+;; Both instrumentation points call the library function
+; SAMPLED: call void @__llvm_profile_instrument_gpu(
+; SAMPLED: call void @__llvm_profile_instrument_gpu(
+
+;; ---- No-sampling mode (sampling=0) ----
+
+;; No sampling guard — direct call
+; NOSAMPLE-LABEL: define {{.*}} @test_kernel
+; NOSAMPLE: entry:
+; NOSAMPLE-NOT: @__llvm_profile_sampling_gpu
+; NOSAMPLE: call void @__llvm_profile_instrument_gpu(
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll
new file mode 100644
index 0000000000000..ffdd41043bdbe
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll
@@ -0,0 +1,22 @@
+;; Test that AMDGPU targets generate uniform counter arrays alongside regular
+;; counters. The uniform counter is passed to __llvm_profile_instrument_gpu which
+;; updates it when all lanes in the wave are active.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+ at __hip_cuid_test123 = addrspace(1) global i8 0
+ at __profn_test_kernel = private constant [11 x i8] c"test_kernel"
+
+define amdgpu_kernel void @test_kernel() {
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 12345, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; Per-function counter + uniform-counter globals (comdat)
+; CHECK: @__profc_test_kernel = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__llvm_prf_unifcnt_test_kernel = linkonce_odr protected addrspace(1) global [1 x i64]
+
+;; __llvm_profile_instrument_gpu receives counter and uniform-counter bases
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) @__profc_test_kernel to ptr), ptr addrspacecast (ptr addrspace(1) @__llvm_prf_unifcnt_test_kernel to ptr), i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
new file mode 100644
index 0000000000000..457e98e7126f5
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
@@ -0,0 +1,33 @@
+;; Test that AMDGPU PGO instrumentation generates library calls for Wave32.
+;; Verifies sampling guard and __llvm_profile_instrument_gpu call with correct
+;; addrspace(1) counter pointers.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+ at __hip_cuid_abcdef456 = addrspace(1) global i8 0
+ at __profn_kernel_w32 = private constant [10 x i8] c"kernel_w32"
+
+define amdgpu_kernel void @kernel_w32() #0 {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel_w32, i64 12345, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+attributes #0 = { "target-cpu"="gfx1100" }
+
+;; Per-function comdat counters + uniform counters; wave size 32 in profile data
+; CHECK: @__profc_kernel_w32 = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__llvm_prf_unifcnt_kernel_w32 = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__profd_kernel_w32 = linkonce_odr protected addrspace(1) global { {{.*}} i16 32, i32 0 }
+; CHECK: @__llvm_profile_sections = external hidden addrspace(1) constant [9 x ptr addrspace(1)]
+; CHECK: @__llvm_offload_prf_abcdef456 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
+
+;; Check sampling guard (default sampling=3)
+; CHECK: %pgo.sampled = call i32 @__llvm_profile_sampling_gpu(i32 3)
+; CHECK: %pgo.matched = icmp ne i32 %pgo.sampled, 0
+; CHECK: br i1 %pgo.matched, label %po_then, label %po_cont
+
+;; Check library call
+; CHECK: po_then:
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) @__profc_kernel_w32 to ptr), ptr addrspacecast (ptr addrspace(1) @__llvm_prf_unifcnt_kernel_w32 to ptr), i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
new file mode 100644
index 0000000000000..9546e42f08de3
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
@@ -0,0 +1,31 @@
+;; Test that AMDGPU PGO instrumentation generates library calls for Wave64.
+;; Wave64 targets (e.g., gfx908) should embed wave size 64 in profile data.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+ at __hip_cuid_abcdef123 = addrspace(1) global i8 0
+ at __profn_kernel_w64 = private constant [10 x i8] c"kernel_w64"
+
+define amdgpu_kernel void @kernel_w64() #0 {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel_w64, i64 12345, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+attributes #0 = { "target-cpu"="gfx908" }
+
+;; Per-function comdat counters; wave size 64 in profile data (gfx908)
+; CHECK: @__profc_kernel_w64 = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__llvm_prf_unifcnt_kernel_w64 = linkonce_odr protected addrspace(1) global [1 x i64]
+; CHECK: @__profd_kernel_w64 = linkonce_odr protected addrspace(1) global { {{.*}} i16 64, i32 0 }
+; CHECK: @__llvm_offload_prf_abcdef123 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
+
+;; Check sampling guard
+; CHECK: %pgo.sampled = call i32 @__llvm_profile_sampling_gpu(i32 3)
+; CHECK: %pgo.matched = icmp ne i32 %pgo.sampled, 0
+; CHECK: br i1 %pgo.matched, label %po_then, label %po_cont
+
+;; Check library call
+; CHECK: po_then:
+; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) @__profc_kernel_w64 to ptr), ptr addrspacecast (ptr addrspace(1) @__llvm_prf_unifcnt_kernel_w64 to ptr), i64 1)
diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
index 08cbcaa962b76..695a8829fdf75 100644
--- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
@@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu"
 
 @__profn_foo = private constant [3 x i8] c"foo"
 ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
-; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
+; CHECK: @__profd_foo = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
+; BINARY: @__profd_foo = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
 @__profn_bar = private constant [3 x i8] c"bar"
 ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
-; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
+; CHECK: @__profd_bar = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
+; BINARY: @__profd_bar = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
 
 ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names"
 ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames"
diff --git a/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
new file mode 100644
index 0000000000000..ce16f1ee3215f
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -passes=instrprof -S | FileCheck %s
+
+; Test that weak functions on GPU targets get weak linkage for their
+; __profd_ aliases to allow linker deduplication across TUs.
+; Non-weak functions get external linkage (default for aliases).
+
+target triple = "amdgcn-amd-amdhsa"
+
+ at __hip_cuid_abc123 = addrspace(1) global i8 0
+
+; AMDGPU GPU profiling lowers to per-function comdat globals (not aliases).
+; CHECK: @__profd_weak_func = linkonce_odr protected addrspace(1) global
+ at __profn_weak_func = private constant [9 x i8] c"weak_func"
+
+define weak void @weak_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_weak_func, i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__profd_weak_odr_func = linkonce_odr protected addrspace(1) global
+ at __profn_weak_odr_func = private constant [13 x i8] c"weak_odr_func"
+
+define weak_odr void @weak_odr_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_weak_odr_func, i64 0, i32 1, i32 0)
+  ret void
+}
+
+; CHECK: @__profd_normal_func = linkonce_odr protected addrspace(1) global
+ at __profn_normal_func = private constant [11 x i8] c"normal_func"
+
+define void @normal_func() {
+  call void @llvm.instrprof.increment(ptr @__profn_normal_func, i64 0, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
diff --git a/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll b/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
index 456103164378e..8644c320f867b 100644
--- a/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
@@ -7,17 +7,18 @@
 
 target triple = "x86_64-unknown-linux-gnu"
 
-; INLINEFIRST: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; INLINEFIRST: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
-; INLINEFIRST: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-
-; INLINEAFTER: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-; INLINEAFTER: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; INLINEAFTER: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
-
-; NOINLINE: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-; NOINLINE: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; NOINLINE: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
+;; Note: struct layout is { ..., i32, [3 x i16], i16, i32 } where the i16 is OffloadDeviceWaveSize
+; INLINEFIRST: @__profd_foo = private global{{.*}}i16 0, i32 21
+; INLINEFIRST: @__profd_bar = private global{{.*}}i16 0, i32 23
+; INLINEFIRST: @__profd_foobar = private global{{.*}}i16 0, i32 99
+
+; INLINEAFTER: @__profd_foobar = private global{{.*}}i16 0, i32 99
+; INLINEAFTER: @__profd_foo = private global{{.*}}i16 0, i32 21
+; INLINEAFTER: @__profd_bar = private global{{.*}}i16 0, i32 23
+
+; NOINLINE: @__profd_foobar = private global{{.*}}i16 0, i32 99
+; NOINLINE: @__profd_foo = private global{{.*}}i16 0, i32 21
+; NOINLINE: @__profd_bar = private global{{.*}}i16 0, i32 23
 
 declare void @llvm.instrprof.increment(ptr %0, i64 %1, i32 %2, i32 %3)
 declare void @llvm.instrprof.mcdc.parameters(ptr %0, i64 %1, i32 %2)
diff --git a/llvm/test/Instrumentation/InstrProfiling/platform.ll b/llvm/test/Instrumentation/InstrProfiling/platform.ll
index 9c76a5caf2a51..5f7eb47d2b611 100644
--- a/llvm/test/Instrumentation/InstrProfiling/platform.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/platform.ll
@@ -9,26 +9,35 @@
 ; RUN: opt < %s -mtriple=x86_64-pc-windows -passes=instrprof -S | FileCheck %s -check-prefix=WINDOWS
 ; RUN: opt < %s -mtriple=powerpc64-ibm-aix-xcoff -passes=instrprof -S | FileCheck %s -check-prefix=AIX
 ; RUN: opt < %s -mtriple=arm-elf -passes=instrprof -S | FileCheck %s -check-prefix=BAREMETAL
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s -check-prefix=AMDGPU
 
 @__profn_foo = private constant [3 x i8] c"foo"
 ; MACHO-NOT: __profn_foo
 ; ELF-NOT: __profn_foo
 ; WINDOWS-NOT: __profn_foo
 ; AIX-NOT: __profn_foo
+; AMDGPU-NOT: __profn_foo
 
 ; MACHO: @__profc_foo = private global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
 ; ELF: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
 ; WINDOWS: @__profc_foo = private global [1 x i64] zeroinitializer, section ".lprfc$M", align 8
 ; AIX: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", align 8
+;; AMDGPU without CUID: COMDAT counters + uniform counter slot (HIP-style GPU profiling).
+; AMDGPU: $__llvm_prf_unifcnt_foo = comdat any
+; AMDGPU: @__profc_foo = linkonce_odr protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
+; AMDGPU: @__llvm_prf_unifcnt_foo = linkonce_odr protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_ucnts", comdat, align 8
 
 ; MACHO: @__profd_foo = private {{.*}}, section "__DATA,__llvm_prf_data,regular,live_support", align 8
 ; ELF: @__profd_foo = private {{.*}}, section "__llvm_prf_data", comdat($__profc_foo), align 8
 ; WINDOWS: @__profd_foo = private global {{.*}}, section ".lprfd$M", align 8
 ; AIX: @__profd_foo = private {{.*}}, section "__llvm_prf_data", align 8
+;; AMDGPU without CUID uses per-function data (not alias)
+; AMDGPU: @__profd_foo = linkonce_odr protected addrspace(1) global {{.*}}, section "__llvm_prf_data", comdat($__profc_foo), align 8
 
 ; ELF: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names"{{.*}}, align 1
 ; WINDOWS: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}lprfn$M", align 1
 ; AIX: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names", align 1
+; AMDGPU: @__llvm_prf_nm = private addrspace(1) constant [{{.*}} x i8] c"{{.*}}", section "__llvm_prf_names", align 1
 
 define void @foo() {
   call void @llvm.instrprof.increment(ptr @__profn_foo, i64 0, i32 1, i32 0)
@@ -37,6 +46,9 @@ define void @foo() {
 
 declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 
+;; AMDGPU without CUID uses standard per-function allocation (for OpenMP compatibility)
+;; Start/stop symbols behavior is platform-specific
+
 ;; Emit registration functions for platforms that don't find the
 ;; symbols by their sections.
 
@@ -48,6 +60,7 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 ; WINDOWS-NOT: define internal void @__llvm_profile_register_functions
 ; AIX-NOT: define internal void @__llvm_profile_register_functions
 ; BAREMETAL-NOT: define internal void @__llvm_profile_register_functions
+; AMDGPU-NOT: define internal void @__llvm_profile_register_functions
 
 ;; PR38340: When dynamic registration is used, we had a bug where we'd register
 ;; something that's not a __profd_* variable.
@@ -60,3 +73,4 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 ; WINDOWS-NOT: define internal void @__llvm_profile_init
 ; AIX-NOT: define internal void @__llvm_profile_init
 ; BAREMETAL-NOT: define internal void @__llvm_profile_init
+; AMDGPU-NOT: define internal void @__llvm_profile_init
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw
index 3daa98f937b691880ffff203c9426bfacddf749d..596ba3236108eedb5cf7546418d99f9d24f2c7d7 100644
GIT binary patch
delta 207
zcmZ3$a)X7lu_!ISs37M*_e4(ni4GiKB*7ODqoTHZ<^TT at H*Vc at 37eQ^#m4{z1}wKh
zBFnVo7W{_*m<)_|fT(z;rIvt0MF3a at 1H*+SAhRaxF)GO^;83~%taNf3<17V;xeN at 9
IC at w+p0h4(_hX4Qo

delta 174
zcmcb?vVeuNu_!ISs37M**F;YHi3tLH8J8YEkzV!x|K>Vr=_jER>#W!UR)JWP`52`p
zJ`fOF at gD*hpdbMzt0h-}CRG5IVqln|Cj&Adj!{X>0!_&Qi0*levlJi(F)%Pf4Te~Y
G-~#}Nbw4lw

diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
new file mode 100644
index 0000000000000..fc1c3c227bd05
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
@@ -0,0 +1,23 @@
+;; Test that value profiling (indirect call profiling) is disabled for GPU targets.
+;; The device-side profiling runtime does not implement
+;; __llvm_profile_instrument_target, so indirect call profiling must not be emitted.
+
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at fptr = addrspace(1) global ptr null, align 8
+
+;; Verify that regular block instrumentation IS emitted
+; CHECK: call void @llvm.instrprof.increment
+
+;; Verify that value profiling for indirect calls is NOT emitted
+; CHECK-NOT: call void @llvm.instrprof.value.profile
+
+define amdgpu_kernel void @test_indirect_call() {
+entry:
+  %fp = load ptr, ptr addrspace(1) @fptr, align 8
+  call void %fp()
+  ret void
+}
diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
index 1bad0db1b4762..c948ad84392a0 100644
--- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
@@ -13,9 +13,9 @@ $foo = comdat any
 ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
 ; CHECK-NOT: __profn__stdin__foo
 ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
+; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, i64 0, ptr null
 ; CHECK-NOT: @foo
-; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
+; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
 ; CHECK: @__llvm_prf_nm
 ; CHECK: @llvm.compiler.used
 
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
index 56d8364d8f543..7261cc24b0dcf 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
@@ -14,7 +14,7 @@ $__llvm_profile_raw_version = comdat any
 
 ; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
 ; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
+; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
 ; SAMPLE-VAR: @__llvm_prf_nm = private constant {{.*}}, section "__llvm_prf_names", align 1
 ; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
 ; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
diff --git a/llvm/test/Transforms/PGOProfile/vtable_profile.ll b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
index aae1e2d8b4e49..0c554db05cfb4 100644
--- a/llvm/test/Transforms/PGOProfile/vtable_profile.ll
+++ b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
@@ -49,7 +49,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @llvm.compiler.used = appending global [1 x ptr] [ptr @_ZTV5Base1], section "llvm.metadata"
 
 ; GEN: __llvm_profile_raw_version = comdat any
-; GEN: __llvm_profile_raw_version = hidden constant i64 72057594037927946, comdat
+; GEN: __llvm_profile_raw_version = hidden constant i64 72057594037927947, comdat
 ; GEN: __profn__Z4funci = private constant [8 x i8] c"_Z4funci"
 
 ; LOWER: $__profvt__ZTV7Derived = comdat nodeduplicate
diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw
index a3e884343942ebc70ba95ab4ee006630b6816d80..dec90151a79352bf898c8c9d1d2d23eaa4bf5eed 100644
GIT binary patch
delta 197
zcmeys|3ZMXu_!ISs37M*_e4(ni4GiKB*Ev<^}llG%K!gm%^2F;!Y0QtOHFJr0Lg8T
zXJQeUI8h$N{vgl9ZUA9VHei%zVt{c97!??2KqWQ+CD>j-IUj%$8j~+Fg0xRIV3KFz
n34rkmm=qWbV4Mv=DGmo1?*mXu0>&|5R$#mVF>Z4Ka{~(iCWJse

delta 160
zcmaDM at PVJRu_!ISs37M**F;YHi3tLH4>VUWO0D|;f9~Aye@{av)_G05AkX9?G4Z4P
zWC=!jCUpl8D*?#5-~nQ-U{qk-0b+ar3T+dZypTzLvIUa at V+5Ghz$DMKrUT480TkQ-
nW(hFMGgW{ECteVkoWU%>m;n}Fz^uS%0Afv?z_IxQa|H_kt<yS_

diff --git a/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw b/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw
index e3f77e870d4d20828119348e70eb44e6d39e0ec0..778e80fce2691a35d9654748763b0aa8c233ec13 100644
GIT binary patch
delta 197
zcmX at Wze9ktu_!ISs37M*_e4(ni4GiKB*EwKYmREqs{j9Gk0sqI4xJpwEH$yg03^3T
zo{2?Z;zW56`-40ay8(nf*?>`=i2=qbU{qk70hQPQlwf-S<$M52XiUDy2+}^;fJvT-
nCjiDPU{YW#fN?eer8pd5ybnMr2^hzKS%L8e#JJ4`%mFL_d2B$e

delta 159
zcmdlXaDbn)u_!ISs37M**F;YHi3tL{4=Vq*Em-yc|6E_D%J_-3-V-m#Gr34i{3t(J
zf>EAH-2uc(0J1K4fLJRS6&QDb7$1N_+XN;rWRjn3!KA<#0cJHY$uq6#05eYj1vh|M
m0?hJE6=1=M7X&6}FbgndfW;RuD=->>SQ963Z2rLP!2$q9VL7b;

diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
index cc3e6c38e6907..f20161f9886e3 100644
--- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test
+++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
@@ -18,7 +18,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes.
 //   2 * 8  binary ID sizes
 // + 2 * 20 binary IDs (of size 20)
@@ -32,6 +32,9 @@ RUN: printf '\3\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\20\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -64,12 +67,14 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t.profraw
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw
-RUN: printf '\310\377\3\0\1\0\0\0' >> %t.profraw
+RUN: printf '\270\377\3\0\1\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -81,8 +86,8 @@ RUN: printf '\067\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\101\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\7\0foo\1bar\0\0\0\0\0\0\0' >> %t.profraw
 
-// RUN: llvm-profdata show --binary-ids  %t.profraw | FileCheck %s
-// CHECK: Instrumentation level: Front-end
-// CHECK: Binary IDs:
-// CHECK-NEXT: 0001020304050607000102030405060700010203
-// CHECK-NEXT: 0101010101010101020202020202020203030303
+RUN: llvm-profdata show --binary-ids  %t.profraw | FileCheck %s
+CHECK: Instrumentation level: Front-end
+CHECK: Binary IDs:
+CHECK-NEXT: 0001020304050607000102030405060700010203
+CHECK-NEXT: 0101010101010101020202020202020203030303
diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
index b62bdad4ddb21..4ffcc89c421ee 100644
--- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test
+++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -17,6 +17,9 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Check for a corrupted size being too large past the end of the file.
 RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw
@@ -24,5 +27,5 @@ RUN: printf '\0\1\2\3\4\5\6\7' >> %t.profraw
 RUN: printf '\0\1\2\3\4\5\6\7' >> %t.profraw
 RUN: printf '\0\1\2\3\0\0\0\0' >> %t.profraw
 
-// RUN: not llvm-profdata show --binary-ids  %t.profraw 2>&1 | FileCheck %s
-// CHECK: malformed instrumentation profile data: not enough data to read binary id data
+RUN: not llvm-profdata show --binary-ids  %t.profraw 2>&1 | FileCheck %s
+CHECK: malformed instrumentation profile data: not enough data to read binary id data
diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
index 705e5efaf5875..4f9876e4d8fb2 100644
--- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
+++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
@@ -18,7 +18,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +26,9 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -46,10 +49,12 @@ RUN: printf '\254\275\030\333\114\302\370\134' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\4\0\3\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\3\0foo\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
index 157c13b926a7e..e7c2fa7877c1a 100644
--- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
+++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
@@ -18,7 +18,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +26,9 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -56,6 +59,7 @@ RUN: cp %t.profraw %t-good.profraw
 // Make NumCounters = 0 so that we get "number of counters is zero" error message
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\3\0foo\0\0\0' >> %t.profraw
@@ -64,12 +68,14 @@ RUN: not llvm-profdata show %t.profraw 2>&1 | FileCheck %s --check-prefix=ZERO
 ZERO: malformed instrumentation profile data: number of counters is zero
 
 // Test a counter value greater than 2^56.
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bad.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bad.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bad.profraw
 // Counter value is 72057594037927937
 RUN: printf '\1\0\0\0\0\0\0\1' >> %t-bad.profraw
 RUN: printf '\3\0foo\0\0\0' >> %t-bad.profraw
 
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-good.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-good.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-good.profraw
 // Counter value is 72057594037927937
diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
index 83cf76f68fb63..ee4b63986e43f 100644
--- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
+++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
@@ -18,7 +18,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +26,9 @@ RUN: printf '\2\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\10\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -51,11 +54,12 @@ RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw
 // the profile reader should error out.
 RUN: printf '\11\0\6\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
-
+RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Counter Section
 
diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
index 0f20a1b0b369c..b43b2408869c2 100644
--- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
+++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t.profraw
 // We should fail on this because the binary IDs is not a multiple of 8 bytes.
 RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
index dfa163f1f3439..8d71914a99995 100644
--- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
+++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
@@ -11,6 +11,9 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\20' >> %t
 RUN: printf '\0\0\0\1\0\4\0\0' >> %t
 RUN: printf '\0\0\0\2\0\4\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/profile-version.test b/llvm/test/tools/llvm-profdata/profile-version.test
index e811699ac63ed..7f627cb89c7d0 100644
--- a/llvm/test/tools/llvm-profdata/profile-version.test
+++ b/llvm/test/tools/llvm-profdata/profile-version.test
@@ -2,7 +2,7 @@ Test the profile version.
 
 RUN: llvm-profdata merge -o %t.profdata %p/Inputs/basic.proftext
 RUN: llvm-profdata show --profile-version %t.profdata | FileCheck %s
-CHECK: Profile version: 13
+CHECK: Profile version: 14
 
 RUN: llvm-profdata merge -o %t.prev.profdata %p/Inputs/basic.proftext --write-prev-version
 RUN: llvm-profdata show --profile-version %t.prev.profdata | FileCheck %s --check-prefix=PREV
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
index 6b3dc96b36270..4f24eb624a2a7 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
@@ -3,7 +3,7 @@
 UNSUPPORTED: system-windows
 // Header
 RUN: printf '\377lprofR\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\12' >> %t
+RUN: printf '\0\0\0\0\0\0\0\13' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -11,6 +11,9 @@ RUN: printf '\0\0\0\0\0\0\0\3' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\4' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\20' >> %t
 RUN: printf '\0\0\0\0\1\0\0\0' >> %t
 RUN: printf '\0\0\0\0\3\0\0\0' >> %t
@@ -22,22 +25,28 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\134\370\302\114\333\030\275\254' >> %t
 RUN: printf '\0\0\0\0\0\0\0\1' >> %t
 RUN: printf '\1\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0' >> %t
 RUN: printf '\0\0\0\3' >> %t
+RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\344\023\165\112\031\035\265\067' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
-RUN: printf '\0\377\377\330' >> %t
-RUN: printf '\2\377\377\323' >> %t
+RUN: printf '\0\377\377\320' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\2\377\377\313' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\2' >> %t
-RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
+RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\0\0\0\0\0\0\0\023' >> %t
 RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
index 95625565f5c0c..96db225b16b78 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201Rforpl\377' > %t
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -10,6 +10,9 @@ RUN: printf '\3\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\4\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\20\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1\0\0\0\0' >> %t
 RUN: printf '\0\0\0\3\0\0\0\0' >> %t
@@ -21,22 +24,28 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\254\275\030\333\114\302\370\134' >> %t
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
+RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\3' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0' >> %t
 RUN: printf '\3\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t
-RUN: printf '\330\377\377\0' >> %t
-RUN: printf '\323\377\377\2' >> %t
+RUN: printf '\320\377\377\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\313\377\377\2' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0' >> %t
-RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t
 RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
index 5316ef5a3e559..8d29323ff472a 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\377lprofr\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\12' >> %t
+RUN: printf '\0\0\0\0\0\0\0\13' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -10,6 +10,9 @@ RUN: printf '\0\0\0\0\0\0\0\3' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\4' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\20' >> %t
 RUN: printf '\0\0\0\1\0\4\0\0' >> %t
 RUN: printf '\0\0\0\3\0\4\0\0' >> %t
@@ -21,22 +24,22 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\134\370\302\114\333\030\275\254' >> %t
 RUN: printf '\0\0\0\0\0\0\0\1' >> %t
 RUN: printf '\0\0\0\1\0\4\0\0' >> %t
-RUN: printf '\0\0\0\3\0\4\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\3\0\4\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\1' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\3' >> %t
+RUN: printf '\0\0\0\1\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\3' >> %t
 
 RUN: printf '\344\023\165\112\031\035\265\067' >> %t
-RUN: printf '\0\0\0\0\0\0\0\02' >> %t
-RUN: printf '\0\0\0\1\0\3\377\310' >> %t
-RUN: printf '\0\0\0\3\0\3\377\303' >> %t
+RUN: printf '\0\0\0\0\0\0\0\2' >> %t
+RUN: printf '\0\0\0\1\0\3\377\300' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\3\0\3\377\273' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\02' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\1' >> %t
+RUN: printf '\0\0\0\2\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\1' >> %t
 
 RUN: printf '\0\0\0\0\0\0\0\023' >> %t
 RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
index 58f4da8cf0386..d90c18cd31b12 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -10,6 +10,9 @@ RUN: printf '\3\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\4\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\20\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t
 RUN: printf '\0\0\4\0\3\0\0\0' >> %t
@@ -21,22 +24,22 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\254\275\030\333\114\302\370\134' >> %t
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t
-RUN: printf '\0\0\4\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\4\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\1\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\3\0\0\0' >> %t
+RUN: printf '\1\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\3\0\0\0' >> %t
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t
-RUN: printf '\310\377\3\0\1\0\0\0' >> %t
-RUN: printf '\303\377\3\0\3\0\0\0' >> %t
+RUN: printf '\300\377\3\0\1\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\273\377\3\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\02\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\1\0\0\0' >> %t
+RUN: printf '\02\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\1\0\0\0' >> %t
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t
 RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
index 47cc6fa4fd7fe..e14e70e747a9a 100644
--- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test
+++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
@@ -2,7 +2,7 @@
 // whenever \12 (LF) is in the input string.
 UNSUPPORTED: system-windows
 RUN: printf '\201rforpl\377' > %t-foo.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -10,6 +10,9 @@ RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\10\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -24,6 +27,7 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 
@@ -31,7 +35,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw
 
 RUN: printf '\201rforpl\377' > %t-bar.profraw
-RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\13\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
@@ -39,6 +43,9 @@ RUN: printf '\2\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\10\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
@@ -53,6 +60,7 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index ab67d75770fee..a19c30439c372 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
@@ -825,6 +826,22 @@ loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
       I.Name = (*Remapper)(I.Name);
     const StringRef FuncName = I.Name;
     bool Reported = false;
+
+    if (!I.UniformCounts.empty()) {
+      size_t NumBlocks = I.Counts.size();
+      I.UniformityBits.resize((NumBlocks + 7) / 8, 0xFF);
+
+      for (size_t BlockIdx = 0; BlockIdx < NumBlocks; ++BlockIdx) {
+        uint64_t TotalCount = I.Counts[BlockIdx];
+        uint64_t UniformCount = I.UniformCounts[BlockIdx];
+
+        bool IsUniform =
+            (TotalCount == 0) || ((double)UniformCount / TotalCount >= 0.9);
+        if (!IsUniform)
+          I.UniformityBits[BlockIdx / 8] &= ~(1 << (BlockIdx % 8));
+      }
+    }
+
     WC->Writer.addRecord(std::move(I), Input.Weight, [&](Error E) {
       if (Reported) {
         consumeError(std::move(E));
@@ -2979,6 +2996,16 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
           OS << (I == Start ? "" : ", ") << Func.Counts[I];
         }
         OS << "]\n";
+
+        // Show uniformity bits if present
+        if (!Func.UniformityBits.empty()) {
+          OS << "    Block uniformity: [";
+          for (size_t I = Start, E = Func.Counts.size(); I < E; ++I) {
+            bool IsUniform = Func.isBlockUniform(I);
+            OS << (I == Start ? "" : ", ") << (IsUniform ? "U" : "D");
+          }
+          OS << "]\n";
+        }
       }
 
       if (ShowIndirectCallTargets) {
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index fc8d6fe384754..cb46166d909b6 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -73,9 +73,11 @@ struct __llvm_profile_gpu_sections {
 
 extern "C" {
 extern int __attribute__((weak)) __llvm_write_custom_profile(
-    const char *Target, const __llvm_profile_data *DataBegin,
-    const __llvm_profile_data *DataEnd, const char *CountersBegin,
-    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd,
+    const char *Target, const char *TUSuffix,
+    const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd,
+    const char *CountersBegin, const char *CountersEnd,
+    const char *UniformCountersBegin, const char *UniformCountersEnd,
+    const char *NamesBegin, const char *NamesEnd,
     const uint64_t *VersionOverride);
 }
 /// PGO profiling data extracted from a GPU device via __llvm_profile_sections.
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 9216834b1e15e..bb4ad201f0c9c 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -291,11 +291,13 @@ Error GPUProfGlobals::write() const {
         reinterpret_cast<intptr_t>(Records[I].CounterPtr) + Adjustment);
 
   int Result = __llvm_write_custom_profile(
-      TargetTriple.str().c_str(),
+      TargetTriple.str().c_str(), /*TUSuffix=*/nullptr,
       reinterpret_cast<const __llvm_profile_data *>(DataBegin),
       reinterpret_cast<const __llvm_profile_data *>(DataBegin +
                                                     DataSection.size()),
-      CountersBegin, CountersBegin + CountersSection.size(), NamesBegin,
+      CountersBegin, CountersBegin + CountersSection.size(),
+      /*UniformCountersBegin=*/nullptr,
+      /*UniformCountersEnd=*/nullptr, NamesBegin,
       NamesBegin + NamesSection.size(), &Version);
   if (Result != 0)
     return Plugin::error(ErrorCode::HOST_IO,

>From ac0c78150c98b9913b9d8b54bc0518e4d55a0db8 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Thu, 2 Apr 2026 20:47:08 -0400
Subject: [PATCH 2/2] Address review comments from arsenm and jhuber6

- Remove getAMDGPUWavefrontSize and TargetParser dependency; use
  llvm.amdgcn.wavefrontsize intrinsic to store wave size at runtime
- Add PROFILE_INSTRUMENT_GPU and PROFILE_SAMPLING_GPU to RuntimeLibcalls
- Use consume_front and add external linkage check in getCUIDFromModule
- Change metadata separator to dot convention (block.uniformity.profile)
- Lowercase all error/warning messages in InstrProfilingPlatformROCm.c
- Fix InstrProfData.inc formatting, remove datalayout from test
- Restore deleted NVPTX/AMDGPU section comments
- Replace auto with explicit types in getOrCreateUniformCounters
---
 .../lib/profile/InstrProfilingPlatformGPU.c   |  5 ++
 .../lib/profile/InstrProfilingPlatformROCm.c  | 32 +++++-----
 llvm/include/llvm/IR/FixedMetadataKinds.def   |  2 +-
 llvm/include/llvm/IR/RuntimeLibcalls.td       | 13 +++-
 .../Instrumentation/InstrProfiling.cpp        | 62 +++++++++----------
 .../Instrumentation/PGOInstrumentation.cpp    |  2 +-
 .../InstrProfiling/amdgpu-wave32.ll           |  8 ++-
 .../InstrProfiling/amdgpu-wave64.ll           |  8 ++-
 .../amdgpu-disable-value-profiling.ll         |  1 -
 9 files changed, 74 insertions(+), 59 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
index e6a1395042107..8694bff2b365a 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
@@ -88,6 +88,7 @@ extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
 extern char PROF_UCNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_UCNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 
+// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
 COMPILER_RT_GPU_VISIBILITY
 __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
     PROF_NAME_START,  PROF_NAME_STOP,  PROF_CNTS_START,
@@ -96,6 +97,10 @@ __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
 
 #elif defined(__NVPTX__)
 
+// NVPTX supports neither sections nor ELF symbols; we rely on the handling in
+// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
+// FIXME: This will not work until we make the NVPTX backend emit section
+//        globals next to each other.
 COMPILER_RT_GPU_VISIBILITY
 __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
     NULL, NULL, NULL,
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
index fed0949345c7e..72113f9aaaf40 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -293,7 +293,7 @@ static const void *UnwrapOffloadBundle(const void *Image) {
     }
   }
 
-  PROF_WARN("%s", "Offload bundle contains no valid ELF entries\n");
+  PROF_WARN("%s", "offload bundle contains no valid ELF entries\n");
   return NULL;
 }
 
@@ -362,12 +362,12 @@ static int RegisterPrfSymbol(const char *Name, void *UserData) {
   void *DevicePtrVar = NULL;
   size_t Bytes = 0;
   if (hipModuleGetGlobal(&DevicePtrVar, &Bytes, S->Module, Name) != 0) {
-    PROF_WARN("Failed to get symbol %s for module %p\n", Name, S->Module);
+    PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module);
     return 0; /* continue */
   }
   void *DeviceVar = NULL;
   if (hipMemcpy(&DeviceVar, DevicePtrVar, sizeof(void *), 2 /*DToH*/) != 0) {
-    PROF_WARN("Failed to read sections pointer for %s\n", Name);
+    PROF_WARN("failed to read sections pointer for %s\n", Name);
     return 0;
   }
 
@@ -377,7 +377,7 @@ static int RegisterPrfSymbol(const char *Name, void *UserData) {
     OffloadDynamicTUInfo *New = (OffloadDynamicTUInfo *)realloc(
         MI->TUs, NewCap * sizeof(OffloadDynamicTUInfo));
     if (!New) {
-      PROF_ERR("%s\n", "Failed to grow TU array");
+      PROF_ERR("%s\n", "failed to grow TU array");
       return 0;
     }
     MI->TUs = New;
@@ -441,7 +441,7 @@ void __llvm_profile_offload_register_dynamic_module(int ModuleLoadRc,
 #endif
 
   if (MI->NumTUs == 0) {
-    PROF_WARN("No __llvm_offload_prf_* symbols found in module %p\n", *Ptr);
+    PROF_WARN("no __llvm_offload_prf_* symbols found in module %p\n", *Ptr);
   } else if (IsVerboseMode()) {
     PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs);
   }
@@ -475,7 +475,7 @@ void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
         if (ProcessDeviceOffloadPrf(TU->DeviceVar, TUIndex, ArchName) == 0)
           TU->Processed = 1;
         else
-          PROF_WARN("Failed to process profile data for module %p TU %d\n", Ptr,
+          PROF_WARN("failed to process profile data for module %p TU %d\n", Ptr,
                     t);
       }
     }
@@ -483,7 +483,7 @@ void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
   }
 
   if (IsVerboseMode())
-    PROF_WARN("Unregister called for unknown module %p\n", Ptr);
+    PROF_WARN("unregister called for unknown module %p\n", Ptr);
 }
 
 /* Grow a void* array, doubling capacity (or starting at InitCap). */
@@ -527,7 +527,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
 
   if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections),
                 2 /*DToH*/) != 0) {
-    PROF_ERR("%s\n", "Failed to copy offload prf structure from device");
+    PROF_ERR("%s\n", "failed to copy offload prf structure from device");
     return -1;
   }
 
@@ -627,7 +627,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
       (CountersSize > 0 && !HostCountersBegin) ||
       (NamesSize > 0 && !HostNamesBegin) ||
       (UniformCountersSize > 0 && !HostUniformCountersBegin)) {
-    PROF_ERR("%s\n", "Failed to allocate host memory for device sections");
+    PROF_ERR("%s\n", "failed to allocate host memory for device sections");
     goto cleanup;
   }
 
@@ -641,7 +641,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
       (UniformCountersSize > 0 && !UCntsReused &&
        memcpyDeviceToHost(HostUniformCountersBegin, DevUniformCntsBegin,
                           UniformCountersSize) != 0)) {
-    PROF_ERR("%s\n", "Failed to copy profile sections from device");
+    PROF_ERR("%s\n", "failed to copy profile sections from device");
     goto cleanup;
   }
 
@@ -688,7 +688,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
           &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
           &PaddingBytesAfterUniformCounters, &PaddingBytesAfterNames,
           &PaddingBytesAfterVTable, &PaddingBytesAfterVNames) != 0) {
-    PROF_ERR("%s\n", "Failed to get padding sizes");
+    PROF_ERR("%s\n", "failed to get padding sizes");
     goto cleanup;
   }
 
@@ -696,7 +696,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
       DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
   char *ContiguousBuffer = (char *)malloc(ContiguousBufferSize);
   if (!ContiguousBuffer) {
-    PROF_ERR("%s\n", "Failed to allocate contiguous buffer");
+    PROF_ERR("%s\n", "failed to allocate contiguous buffer");
     goto cleanup;
   }
   memset(ContiguousBuffer, 0, ContiguousBufferSize);
@@ -817,7 +817,7 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex,
   free(ReorderedUniformCounters);
 
   if (ret != 0) {
-    PROF_ERR("%s\n", "Failed to write device profile using shared API");
+    PROF_ERR("%s\n", "failed to write device profile using shared API");
   } else if (IsVerboseMode()) {
     PROF_NOTE("%s\n", "Successfully wrote device profile using shared API");
   }
@@ -838,7 +838,7 @@ static int ProcessShadowVariable(void *ShadowVar, int TUIndex,
                                  const char *Target) {
   void *DevicePtrVar = NULL;
   if (hipGetSymbolAddress(&DevicePtrVar, ShadowVar) != 0) {
-    PROF_WARN("Failed to get symbol address for shadow variable %p\n",
+    PROF_WARN("failed to get symbol address for shadow variable %p\n",
               ShadowVar);
     return -1;
   }
@@ -847,7 +847,7 @@ static int ProcessShadowVariable(void *ShadowVar, int TUIndex,
   void *DeviceOffloadPrf = NULL;
   if (hipMemcpy(&DeviceOffloadPrf, DevicePtrVar, sizeof(void *), 2 /*DToH*/) !=
       0) {
-    PROF_WARN("Failed to read sections pointer from shadow variable %p\n",
+    PROF_WARN("failed to read sections pointer from shadow variable %p\n",
               ShadowVar);
     return -1;
   }
@@ -904,7 +904,7 @@ int __llvm_profile_hip_collect_device_data(void) {
     OffloadDynamicModuleInfo *MI = &DynamicModules[i];
     for (int t = 0; t < MI->NumTUs; ++t) {
       if (!MI->TUs[t].Processed) {
-        PROF_WARN("Dynamic module %p TU %d was not processed before exit\n",
+        PROF_WARN("dynamic module %p TU %d was not processed before exit\n",
                   MI->ModulePtr, t);
         Ret = -1;
       }
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index d238c81ecc152..38a97235849b9 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -60,4 +60,4 @@ LLVM_FIXED_MD_KIND(MD_alloc_token, "alloc_token", 45)
 LLVM_FIXED_MD_KIND(MD_implicit_ref, "implicit.ref", 46)
 LLVM_FIXED_MD_KIND(MD_nofpclass, "nofpclass", 47)
 LLVM_FIXED_MD_KIND(MD_call_target, "call_target", 48)
-LLVM_FIXED_MD_KIND(MD_block_uniformity_profile, "block-uniformity-profile", 49)
+LLVM_FIXED_MD_KIND(MD_block_uniformity_profile, "block.uniformity.profile", 49)
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index a0f505f1fda2f..3e9c199314891 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -497,6 +497,10 @@ def DEOPTIMIZE : RuntimeLibcall;
 // Return address
 def RETURN_ADDRESS : RuntimeLibcall;
 
+// GPU profiling
+def PROFILE_INSTRUMENT_GPU : RuntimeLibcall;
+def PROFILE_SAMPLING_GPU : RuntimeLibcall;
+
 // Clear cache
 def CLEAR_CACHE : RuntimeLibcall;
 def RISCV_FLUSH_ICACHE : RuntimeLibcall;
@@ -2238,8 +2242,13 @@ def WindowsARM64ECSystemLibrary
 
 def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">;
 
-// No calls.
-def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>;
+def __llvm_profile_instrument_gpu : RuntimeLibcallImpl<PROFILE_INSTRUMENT_GPU>;
+def __llvm_profile_sampling_gpu : RuntimeLibcallImpl<PROFILE_SAMPLING_GPU>;
+
+def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add
+  __llvm_profile_instrument_gpu,
+  __llvm_profile_sampling_gpu
+)>;
 
 //===----------------------------------------------------------------------===//
 // ARM Runtime Libcalls
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index c2fcfb4dfa6b9..50460fcc35c3d 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -54,7 +54,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/TargetParser/TargetParser.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -259,11 +258,11 @@ static bool profDataReferencedByCode(const Module &M) {
 // identifies each translation unit. Returns empty string if not found.
 static std::string getCUIDFromModule(const Module &M) {
   for (const GlobalVariable &GV : M.globals()) {
+    if (!GV.hasExternalLinkage())
+      continue;
     StringRef Name = GV.getName();
-    if (Name.starts_with("__hip_cuid_")) {
-      // Extract the hash suffix after "__hip_cuid_"
-      return Name.drop_front(strlen("__hip_cuid_")).str();
-    }
+    if (Name.consume_front("__hip_cuid_"))
+      return Name.str();
   }
   return "";
 }
@@ -328,6 +327,7 @@ class InstrLowerer final {
   // points to avoid redundant IR and help the optimizer.
   struct AMDGPUPGOInvariants {
     Value *Matched = nullptr;
+    bool WaveSizeStored = false;
   };
   DenseMap<Function *, AMDGPUPGOInvariants> AMDGPUInvariantsCache;
 
@@ -1304,31 +1304,6 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   Inc->eraseFromParent();
 }
 
-// Determine the wavefront size for an AMDGPU function.
-// Checks target-features attribute first (+wavefrontsize32/+wavefrontsize64),
-// then falls back to the default wavefront size for the target-cpu.
-// Returns 32 or 64. Defaults to 32 if undetermined.
-static unsigned getAMDGPUWavefrontSize(const Function &F) {
-  // Check target-features attribute for explicit wavefront size
-  StringRef Features = F.getFnAttribute("target-features").getValueAsString();
-  if (Features.contains("+wavefrontsize64"))
-    return 64;
-  if (Features.contains("+wavefrontsize32"))
-    return 32;
-
-  // Fall back to default wavefront size based on target-cpu
-  StringRef CPU = F.getFnAttribute("target-cpu").getValueAsString();
-  if (!CPU.empty()) {
-    AMDGPU::GPUKind Kind = AMDGPU::parseArchAMDGCN(CPU);
-    unsigned Features = AMDGPU::getArchAttrAMDGCN(Kind);
-    if (Features & AMDGPU::FEATURE_WAVE32)
-      return 32;
-    return 64; // gfx9 and older default to Wave64
-  }
-
-  return 32; // conservative default
-}
-
 InstrLowerer::AMDGPUPGOInvariants &
 InstrLowerer::getOrCreateAMDGPUInvariants(Function *F) {
   auto It = AMDGPUInvariantsCache.find(F);
@@ -1371,6 +1346,27 @@ void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
 
   // --- Counter address ---
   GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+
+  // Store wavefront size into the profile data struct once per function.
+  // Uses the llvm.amdgcn.wavefrontsize intrinsic which the backend folds
+  // to a constant based on the actual subtarget.
+  if (!Inv.WaveSizeStored) {
+    Inv.WaveSizeStored = true;
+    GlobalVariable *NamePtr = Inc->getName();
+    auto &PD = ProfileDataMap[NamePtr];
+    if (PD.DataVar) {
+      IRBuilder<> EntryBuilder(&*F->getEntryBlock().getFirstInsertionPt());
+      Function *WaveSizeFn =
+          Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_wavefrontsize);
+      Value *WaveSize = EntryBuilder.CreateCall(WaveSizeFn);
+      Value *WaveSize16 = EntryBuilder.CreateTrunc(
+          WaveSize, Type::getInt16Ty(Context), "wavesize.i16");
+      Value *WaveSizeAddr = EntryBuilder.CreateStructGEP(
+          PD.DataVar->getValueType(), PD.DataVar, 9, "profd.wavesize");
+      EntryBuilder.CreateStore(WaveSize16, WaveSizeAddr);
+    }
+  }
+
   Value *Indices[] = {Builder.getInt32(0), CounterIdx};
   Value *Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
                                           Indices, "ctr.addr");
@@ -1970,8 +1966,8 @@ InstrLowerer::getOrCreateUniformCounters(InstrProfCntrInstBase *Inc) {
 
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
 
-  auto &Ctx = M.getContext();
-  auto *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+  LLVMContext &Ctx = M.getContext();
+  ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
 
   bool Renamed;
   std::string VarName = getVarName(Inc, "__llvm_prf_unifcnt_", Renamed);
@@ -2073,8 +2069,6 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
   uint16_t OffloadDeviceWaveSizeVal = 0;
-  if (TT.isAMDGPU())
-    OffloadDeviceWaveSizeVal = getAMDGPUWavefrontSize(*Fn);
 
   if (isGPUProfTarget(M)) {
     // For GPU targets, weak functions need weak linkage for their profile data
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 3e8096d28817c..38e53c749753d 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1830,7 +1830,7 @@ void PGOUseFunc::setBlockUniformityAttribute() {
   UniformityStr.reserve(InstrumentBBs.size());
   for (size_t I = 0, E = InstrumentBBs.size(); I < E; ++I)
     UniformityStr += ProfileRecord.isBlockUniform(I) ? 'U' : 'D';
-  F.addFnAttr("block-uniformity-profile", UniformityStr);
+  F.addFnAttr("block.uniformity.profile", UniformityStr);
 
   LLVM_DEBUG(dbgs() << "PGO: Set block uniformity profile for " << F.getName()
                     << ": " << UniformityStr << "\n");
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
index 457e98e7126f5..1b9681880596c 100644
--- a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave32.ll
@@ -16,13 +16,17 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 
 attributes #0 = { "target-cpu"="gfx1100" }
 
-;; Per-function comdat counters + uniform counters; wave size 32 in profile data
+;; Per-function comdat counters + uniform counters
 ; CHECK: @__profc_kernel_w32 = linkonce_odr protected addrspace(1) global [1 x i64]
 ; CHECK: @__llvm_prf_unifcnt_kernel_w32 = linkonce_odr protected addrspace(1) global [1 x i64]
-; CHECK: @__profd_kernel_w32 = linkonce_odr protected addrspace(1) global { {{.*}} i16 32, i32 0 }
+; CHECK: @__profd_kernel_w32 = linkonce_odr protected addrspace(1) global { {{.*}} i16 0, i32 0 }
 ; CHECK: @__llvm_profile_sections = external hidden addrspace(1) constant [9 x ptr addrspace(1)]
 ; CHECK: @__llvm_offload_prf_abcdef456 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
 
+;; Check wave size stored via intrinsic
+; CHECK: %wavesize.i16 = trunc i32 %{{.*}} to i16
+; CHECK: store i16 %wavesize.i16, ptr addrspace(1) getelementptr inbounds {{.*}} @__profd_kernel_w32
+
 ;; Check sampling guard (default sampling=3)
 ; CHECK: %pgo.sampled = call i32 @__llvm_profile_sampling_gpu(i32 3)
 ; CHECK: %pgo.matched = icmp ne i32 %pgo.sampled, 0
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
index 9546e42f08de3..37c0343d23d28 100644
--- a/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-wave64.ll
@@ -15,12 +15,16 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 
 attributes #0 = { "target-cpu"="gfx908" }
 
-;; Per-function comdat counters; wave size 64 in profile data (gfx908)
+;; Per-function comdat counters
 ; CHECK: @__profc_kernel_w64 = linkonce_odr protected addrspace(1) global [1 x i64]
 ; CHECK: @__llvm_prf_unifcnt_kernel_w64 = linkonce_odr protected addrspace(1) global [1 x i64]
-; CHECK: @__profd_kernel_w64 = linkonce_odr protected addrspace(1) global { {{.*}} i16 64, i32 0 }
+; CHECK: @__profd_kernel_w64 = linkonce_odr protected addrspace(1) global { {{.*}} i16 0, i32 0 }
 ; CHECK: @__llvm_offload_prf_abcdef123 = addrspace(1) constant ptr addrspace(1) @__llvm_profile_sections
 
+;; Check wave size stored via intrinsic
+; CHECK: %wavesize.i16 = trunc i32 %{{.*}} to i16
+; CHECK: store i16 %wavesize.i16, ptr addrspace(1) getelementptr inbounds {{.*}} @__profd_kernel_w64
+
 ;; Check sampling guard
 ; CHECK: %pgo.sampled = call i32 @__llvm_profile_sampling_gpu(i32 3)
 ; CHECK: %pgo.matched = icmp ne i32 %pgo.sampled, 0
diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
index fc1c3c227bd05..21b1d68004b13 100644
--- a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
+++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll
@@ -4,7 +4,6 @@
 
 ; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s
 
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 target triple = "amdgcn-amd-amdhsa"
 
 @fptr = addrspace(1) global ptr null, align 8



More information about the llvm-commits mailing list