[clang] [compiler-rt] [llvm] [PGO][AMDGPU] Add offload profiling with uniformity-aware optimization (PR #177665)

Yaxun Liu via cfe-commits <cfe-commits at lists.llvm.org>
Tue Feb 3 08:01:18 PST 2026


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/177665

From 733f8a34a3d35b6f772bd2be146ec5cbb63996f6 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 28 Jan 2026 10:33:29 -0500
Subject: [PATCH 1/5] [PGO][AMDGPU] Add offload profiling infrastructure for
 HIP

---
 compiler-rt/include/profile/InstrProfData.inc |    8 +-
 compiler-rt/lib/profile/CMakeLists.txt        |    1 +
 compiler-rt/lib/profile/InstrProfiling.h      |   20 +
 compiler-rt/lib/profile/InstrProfilingFile.c  |    2 +
 .../lib/profile/InstrProfilingInternal.h      |    7 +
 .../lib/profile/InstrProfilingPlatformROCm.c  |  567 ++++++++
 llvm/include/llvm/ProfileData/InstrProf.h     |   48 +-
 .../llvm/ProfileData/InstrProfData.inc        |   15 +-
 .../llvm/ProfileData/InstrProfWriter.h        |   10 +
 .../llvm/Transforms/Instrumentation/CFGMST.h  |   26 +-
 llvm/lib/Passes/StandardInstrumentations.cpp  |   10 +-
 llvm/lib/ProfileData/InstrProf.cpp            |  139 +-
 llvm/lib/ProfileData/InstrProfCorrelator.cpp  |    2 +
 llvm/lib/ProfileData/InstrProfReader.cpp      |  107 +-
 llvm/lib/ProfileData/InstrProfWriter.cpp      |   72 +-
 .../Instrumentation/InstrProfiling.cpp        | 1217 ++++++++++++++++-
 .../Instrumentation/PGOInstrumentation.cpp    |   80 +-
 .../amdgpu-contiguous-counters.ll             |   41 +
 .../InstrProfiling/amdgpu-uniform-counters.ll |   31 +
 .../InstrProfiling/coverage.ll                |    8 +-
 .../InstrProfiling/inline-data-var-create.ll  |   23 +-
 .../InstrProfiling/platform.ll                |   20 +-
 .../Transforms/PGOProfile/comdat_internal.ll  |    4 +-
 .../instrprof_burst_sampling_fast.ll          |    2 +-
 .../tools/llvm-profdata/profile-version.test  |    2 +-
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  125 +-
 26 files changed, 2470 insertions(+), 117 deletions(-)
 create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
 create mode 100644 llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll

diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 46d6bb5bd8896..fffe5a26b1cb9 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -90,8 +90,12 @@ INSTR_PROF_DATA(IntPtrT, llvm::PointerType::getUnqual(Ctx), Values, \
 INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
 INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \
-                ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
+                ConstantArray::get(Int16ArrayTy, Int16ArrayVals))
+INSTR_PROF_DATA(const uint16_t, llvm::Type::getInt16Ty(Ctx),
+                NumOffloadProfilingThreads,
+                ConstantInt::get(llvm::Type::getInt16Ty(Ctx),
+                                 NumOffloadProfilingThreadsVal))
+INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes,
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes))
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
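
For orientation: when this new NumOffloadProfilingThreads field is non-zero,
each logical counter owns NumOffloadProfilingThreads + 1 consecutive 64-bit
slots, so the raw counter section carries NumCounters *
(NumOffloadProfilingThreads + 1) entries rather than NumCounters (the reader
change later in this patch relies on this). A minimal sizing sketch:

    // Sketch: number of 64-bit counter slots a function occupies when
    // NumOffloadProfilingThreads is non-zero (layout per this patch).
    #include <cstdint>

    uint64_t numCounterSlots(uint32_t NumCounters,
                             uint16_t NumOffloadProfilingThreads) {
      return uint64_t(NumCounters) * (NumOffloadProfilingThreads + 1);
    }
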
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 7c8473cc5f200..d4f64dcb2c6c7 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -73,6 +73,7 @@ set(PROFILE_SOURCES
   InstrProfilingPlatformFuchsia.c
   InstrProfilingPlatformLinux.c
   InstrProfilingPlatformOther.c
+  InstrProfilingPlatformROCm.c
   InstrProfilingPlatformWindows.c
   )
 
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 187ef55ef3784..6ebeb4c2dc6c7 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -349,4 +349,24 @@ extern char INSTR_PROF_PROFILE_NAME_VAR[1]; /* __llvm_profile_filename. */
 
 const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit();
 const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit();
+
+/* A struct to hold the device pointers and sizes for the profile sections. */
+typedef struct OffloadProfileSectionInfo {
+  void *CountersBegin;
+  size_t CountersSize;
+  void *DataBegin;
+  size_t DataSize;
+  void *NamesBegin;
+  size_t NamesSize;
+} OffloadProfileSectionInfo;
+
+/*!
+ * \brief Register an offload module's device-side profile data sections.
+ *
+ * This function is called by the host-side instrumentation code to provide
+ * the runtime with the necessary information to collect profile data from
+ * the device.
+ */
+void __llvm_profile_offload_register_module(OffloadProfileSectionInfo *Info);
+
 #endif /* PROFILE_INSTRPROFILING_H_ */
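
As a usage sketch (hypothetical call site; in this patch the registration
call is emitted by the host-side instrumentation, and the pointer/size
values below are placeholders for data obtained from the device image):

    // Hypothetical: hand one offload module's device sections to the
    // profile runtime. Only __llvm_profile_offload_register_module and
    // OffloadProfileSectionInfo are real; the Dev* variables are assumed.
    OffloadProfileSectionInfo Info;
    Info.CountersBegin = DevCounters; // device __llvm_prf_cnts
    Info.CountersSize = CountersBytes;
    Info.DataBegin = DevData;         // device __llvm_prf_data
    Info.DataSize = DataBytes;
    Info.NamesBegin = DevNames;       // device __llvm_prf_names
    Info.NamesSize = NamesBytes;
    __llvm_profile_offload_register_module(&Info);
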
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 71127b05aafb8..aa9d567a1d17f 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1198,6 +1198,8 @@ int __llvm_profile_write_file(void) {
   if (rc)
     PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno));
 
+  __llvm_profile_hip_collect_device_data();
+
   // Restore SIGKILL.
   if (PDeathSig == 1)
     lprofRestoreSigKill();
diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h
index 5647782527eb7..be6d2627dd100 100644
--- a/compiler-rt/lib/profile/InstrProfilingInternal.h
+++ b/compiler-rt/lib/profile/InstrProfilingInternal.h
@@ -212,5 +212,12 @@ int __llvm_write_binary_ids(ProfDataWriter *Writer);
 int lprofWriteOneBinaryId(ProfDataWriter *Writer, uint64_t BinaryIdLen,
                           const uint8_t *BinaryIdData,
                           uint64_t BinaryIdPadding);
+#ifdef __cplusplus
+extern "C" {
+#endif
+COMPILER_RT_VISIBILITY int __llvm_profile_hip_collect_device_data(void);
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
new file mode 100644
index 0000000000000..18c55036bbc24
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -0,0 +1,567 @@
+//===- InstrProfilingPlatformROCm.c - Profile data ROCm platform ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingPort.h"
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex);
+
+static int IsVerboseMode(void) {
+  static int IsVerbose = -1;
+  if (IsVerbose == -1)
+    IsVerbose = getenv("LLVM_PROFILE_VERBOSE") != NULL;
+  return IsVerbose;
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Dynamic loading of HIP runtime symbols                                   */
+/* -------------------------------------------------------------------------- */
+
+typedef int (*hipMemcpyFromSymbolTy)(void *, const void *, size_t, size_t, int);
+typedef int (*hipGetSymbolAddressTy)(void **, const void *);
+typedef int (*hipMemcpyTy)(void *, void *, size_t, int);
+typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
+
+static hipMemcpyFromSymbolTy pHipMemcpyFromSymbol = NULL;
+static hipGetSymbolAddressTy pHipGetSymbolAddress = NULL;
+static hipMemcpyTy pHipMemcpy = NULL;
+static hipModuleGetGlobalTy pHipModuleGetGlobal = NULL;
+
+/* -------------------------------------------------------------------------- */
+/*  Device-to-host copies                                                     */
+/*  Keep HIP-only to avoid an HSA dependency.                                 */
+/* -------------------------------------------------------------------------- */
+
+static void EnsureHipLoaded(void) {
+  static int Initialized = 0;
+  if (Initialized)
+    return;
+  Initialized = 1;
+
+  void *Handle = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL);
+  if (!Handle) {
+    fprintf(stderr, "compiler-rt: failed to open libamdhip64.so: %s\n",
+            dlerror());
+    return;
+  }
+
+  pHipMemcpyFromSymbol =
+      (hipMemcpyFromSymbolTy)dlsym(Handle, "hipMemcpyFromSymbol");
+  pHipGetSymbolAddress =
+      (hipGetSymbolAddressTy)dlsym(Handle, "hipGetSymbolAddress");
+  pHipMemcpy = (hipMemcpyTy)dlsym(Handle, "hipMemcpy");
+  pHipModuleGetGlobal =
+      (hipModuleGetGlobalTy)dlsym(Handle, "hipModuleGetGlobal");
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Public wrappers that forward to the loaded HIP symbols                   */
+/* -------------------------------------------------------------------------- */
+
+static int hipMemcpyFromSymbol(void *dst, const void *symbol, size_t sizeBytes,
+                               size_t offset, int kind) {
+  EnsureHipLoaded();
+  return pHipMemcpyFromSymbol
+             ? pHipMemcpyFromSymbol(dst, symbol, sizeBytes, offset, kind)
+             : -1;
+}
+
+static int hipGetSymbolAddress(void **devPtr, const void *symbol) {
+  EnsureHipLoaded();
+  return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
+}
+
+static int hipMemcpy(void *dest, void *src, size_t len, int kind /*2=DToH*/) {
+  EnsureHipLoaded();
+  return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1;
+}
+
+/* Copy from device to host using HIP.
+ * This requires that the device section symbols are registered with CLR,
+ * otherwise hipMemcpy may attempt a CPU path and crash. */
+static int memcpyDeviceToHost(void *Dst, void *Src, size_t Size) {
+  return hipMemcpy(Dst, Src, Size, 2 /* DToH */);
+}
+
+static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module,
+                              const char *Name) {
+  EnsureHipLoaded();
+  return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name)
+                             : -1;
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Dynamic module tracking                                                   */
+/* -------------------------------------------------------------------------- */
+
+#define MAX_DYNAMIC_MODULES 256
+
+typedef struct {
+  void *ModulePtr; /* hipModule_t returned by hipModuleLoad            */
+  void *DeviceVar; /* address of __llvm_offload_prf in this module     */
+  int Processed;   /* 0 = not yet collected, 1 = data already copied   */
+} OffloadDynamicModuleInfo;
+
+static OffloadDynamicModuleInfo DynamicModules[MAX_DYNAMIC_MODULES];
+static int NumDynamicModules = 0;
+
+/* -------------------------------------------------------------------------- */
+/*  Registration / un-registration helpers                                   */
+/* -------------------------------------------------------------------------- */
+
+void __llvm_profile_offload_register_dynamic_module(int ModuleLoadRc,
+                                                    void **Ptr) {
+  if (IsVerboseMode())
+    PROF_NOTE("Registering loaded module %d: rc=%d, module=%p\n",
+              NumDynamicModules, ModuleLoadRc, *Ptr);
+
+  if (ModuleLoadRc)
+    return;
+
+  if (NumDynamicModules >= MAX_DYNAMIC_MODULES) {
+    PROF_ERR("Too many dynamic modules registered. Maximum is %d.\n",
+             MAX_DYNAMIC_MODULES);
+    return;
+  }
+
+  OffloadDynamicModuleInfo *Info = &DynamicModules[NumDynamicModules++];
+  Info->ModulePtr = *Ptr;
+  Info->DeviceVar = NULL;
+  Info->Processed = 0;
+
+  size_t Bytes = 0;
+  if (hipModuleGetGlobal(&Info->DeviceVar, &Bytes, *Ptr,
+                         "__llvm_offload_prf") != 0) {
+    PROF_WARN("Failed to get symbol __llvm_offload_prf for module %p\n", *Ptr);
+    /* Leave DeviceVar NULL so later code can recognize the failure */
+    return;
+  }
+
+  if (IsVerboseMode())
+    PROF_NOTE("Module %p: Device profile var %p\n", *Ptr, Info->DeviceVar);
+}
+
+void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *Info = &DynamicModules[i];
+
+    if (Info->ModulePtr == Ptr) {
+      if (IsVerboseMode())
+        PROF_NOTE("Unregistering module %p (DeviceVar=%p, Processed=%d)\n",
+                  Info->ModulePtr, Info->DeviceVar, Info->Processed);
+
+      if (Info->Processed) {
+        PROF_WARN("Module %p has already been unregistered or processed\n",
+                  Ptr);
+        return;
+      }
+
+      if (Info->DeviceVar) {
+        // Use module index as TU index for dynamic modules
+        // to ensure each module gets a unique profile file
+        if (ProcessDeviceOffloadPrf(Info->DeviceVar, i) == 0)
+          Info->Processed = 1;
+        else
+          PROF_WARN(
+              "Failed to process profile data for module %p on unregister\n",
+              Ptr);
+      } else {
+        PROF_WARN("Module %p has no device profile variable to process\n", Ptr);
+      }
+      return;
+    }
+  }
+
+  if (IsVerboseMode())
+    PROF_WARN("Unregister called for unknown module %p\n", Ptr);
+}
+
+#define MAX_SHADOW_VARIABLES 256
+static void *OffloadShadowVariables[MAX_SHADOW_VARIABLES];
+static int NumShadowVariables = 0;
+
+void __llvm_profile_offload_register_shadow_variable(void *ptr) {
+  if (NumShadowVariables >= MAX_SHADOW_VARIABLES) {
+    PROF_ERR("Too many shadow variables registered. Maximum is %d.\n",
+             MAX_SHADOW_VARIABLES);
+    return;
+  }
+  if (IsVerboseMode())
+    PROF_NOTE("Registering shadow variable %d: %p\n", NumShadowVariables, ptr);
+  OffloadShadowVariables[NumShadowVariables++] = ptr;
+}
+
+#define MAX_SECTION_SHADOW_VARIABLES 1024
+static void *OffloadSectionShadowVariables[MAX_SECTION_SHADOW_VARIABLES];
+static int NumSectionShadowVariables = 0;
+
+void __llvm_profile_offload_register_section_shadow_variable(void *ptr) {
+  if (NumSectionShadowVariables >= MAX_SECTION_SHADOW_VARIABLES) {
+    PROF_ERR("Too many section shadow variables registered. Maximum is %d.\n",
+             MAX_SECTION_SHADOW_VARIABLES);
+    return;
+  }
+  if (IsVerboseMode())
+    PROF_NOTE("Registering section shadow variable %d: %p\n",
+              NumSectionShadowVariables, ptr);
+  OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr;
+}
+
+static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
+  void *HostOffloadPrf[8];
+
+  if (IsVerboseMode())
+    PROF_NOTE("HostOffloadPrf buffer size: %zu bytes\n",
+              sizeof(HostOffloadPrf));
+
+  if (hipMemcpy(HostOffloadPrf, DeviceOffloadPrf, sizeof(HostOffloadPrf),
+                2 /*DToH*/) != 0) {
+    PROF_ERR("%s\n", "Failed to copy offload prf structure from device");
+    return -1;
+  }
+
+  void *DevCntsBegin = HostOffloadPrf[0];
+  void *DevDataBegin = HostOffloadPrf[1];
+  void *DevNamesBegin = HostOffloadPrf[2];
+  void *DevUniformCntsBegin = HostOffloadPrf[3];
+  void *DevCntsEnd = HostOffloadPrf[4];
+  void *DevDataEnd = HostOffloadPrf[5];
+  void *DevNamesEnd = HostOffloadPrf[6];
+  void *DevUniformCntsEnd = HostOffloadPrf[7];
+
+  if (IsVerboseMode()) {
+    PROF_NOTE("%s", "Device Profile Pointers:\n");
+    PROF_NOTE("  Counters:        %p - %p\n", DevCntsBegin, DevCntsEnd);
+    PROF_NOTE("  Data:            %p - %p\n", DevDataBegin, DevDataEnd);
+    PROF_NOTE("  Names:           %p - %p\n", DevNamesBegin, DevNamesEnd);
+    PROF_NOTE("  UniformCounters: %p - %p\n", DevUniformCntsBegin,
+              DevUniformCntsEnd);
+  }
+
+  size_t CountersSize = (char *)DevCntsEnd - (char *)DevCntsBegin;
+  size_t DataSize = (char *)DevDataEnd - (char *)DevDataBegin;
+  size_t NamesSize = (char *)DevNamesEnd - (char *)DevNamesBegin;
+  size_t UniformCountersSize =
+      (char *)DevUniformCntsEnd - (char *)DevUniformCntsBegin;
+
+  if (IsVerboseMode()) {
+    PROF_NOTE("Section sizes: Counters=%zu, Data=%zu, Names=%zu, "
+              "UniformCounters=%zu\n",
+              CountersSize, DataSize, NamesSize, UniformCountersSize);
+  }
+
+  if (CountersSize == 0 || DataSize == 0) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s\n", "Counters or Data section has zero size. No profile "
+                        "data to collect.");
+    return 0;
+  }
+
+  // Pre-register device section symbols with CLR memory tracking.
+  // This makes the section base pointers (and sub-pointers) safe for hipMemcpy.
+  if (IsVerboseMode())
+    PROF_NOTE("Pre-registering %d section symbols\n",
+              NumSectionShadowVariables);
+  for (int i = 0; i < NumSectionShadowVariables; ++i) {
+    void *DevPtr = NULL;
+    (void)hipGetSymbolAddress(&DevPtr, OffloadSectionShadowVariables[i]);
+  }
+
+  char *DeviceFilename = NULL;
+  FILE *File = NULL;
+  int ret = -1;
+
+  // Allocate host memory for the device sections
+  char *HostCountersBegin = (char *)malloc(CountersSize);
+  char *HostDataBegin = (char *)malloc(DataSize);
+  char *HostNamesBegin = (char *)malloc(NamesSize);
+  char *HostUniformCountersBegin =
+      (UniformCountersSize > 0) ? (char *)malloc(UniformCountersSize) : NULL;
+
+  if (!HostCountersBegin || !HostDataBegin ||
+      (NamesSize > 0 && !HostNamesBegin) ||
+      (UniformCountersSize > 0 && !HostUniformCountersBegin)) {
+    PROF_ERR("%s\n", "Failed to allocate host memory for device sections");
+    goto cleanup;
+  }
+
+  // Copy data from device to host using HIP.
+  if (memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != 0 ||
+      memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0 ||
+      (NamesSize > 0 &&
+       memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0) ||
+      (UniformCountersSize > 0 &&
+       memcpyDeviceToHost(HostUniformCountersBegin, DevUniformCntsBegin,
+                          UniformCountersSize) != 0)) {
+    PROF_ERR("%s\n", "Failed to copy profile sections from device");
+    goto cleanup;
+  }
+
+  if (IsVerboseMode())
+    PROF_NOTE("Copied device sections: Counters=%zu, Data=%zu, Names=%zu, "
+              "UniformCounters=%zu\n",
+              CountersSize, DataSize, NamesSize, UniformCountersSize);
+
+  if (IsVerboseMode() && UniformCountersSize > 0) {
+    PROF_NOTE("Successfully copied %zu bytes of uniform counters from device\n",
+              UniformCountersSize);
+  }
+
+  // Construct the device-specific filename
+  // Format: <base>.<target>[.<TUIndex>].<ext>
+  // TUIndex is included when >= 0 to support multi-TU programs
+  char *BaseFilename = (char *)__llvm_profile_get_filename();
+  if (!BaseFilename) {
+    PROF_ERR("%s\n", "Failed to get base profile filename");
+    goto cleanup;
+  }
+  if (IsVerboseMode())
+    PROF_NOTE("Base profile filename: %s\n", BaseFilename);
+
+  const char *TargetInfix = "amdgcn-amd-amdhsa";
+  const char *Extension = strrchr(BaseFilename, '.');
+  char TUIndexStr[16] = "";
+  if (TUIndex >= 0) {
+    snprintf(TUIndexStr, sizeof(TUIndexStr), ".%d", TUIndex);
+  }
+
+  if (Extension) {
+    size_t BaseLen = Extension - BaseFilename;
+    size_t InfixLen = strlen(TargetInfix);
+    size_t TUIndexLen = strlen(TUIndexStr);
+    size_t ExtLen = strlen(Extension);
+    DeviceFilename =
+        (char *)malloc(BaseLen + 1 + InfixLen + TUIndexLen + ExtLen + 1);
+    strncpy(DeviceFilename, BaseFilename, BaseLen);
+    DeviceFilename[BaseLen] = '\0';
+    strcat(DeviceFilename, ".");
+    strcat(DeviceFilename, TargetInfix);
+    strcat(DeviceFilename, TUIndexStr);
+    strcat(DeviceFilename, Extension);
+  } else {
+    DeviceFilename =
+        (char *)malloc(strlen(BaseFilename) + 1 + strlen(TargetInfix) +
+                       strlen(TUIndexStr) + 1);
+    strcpy(DeviceFilename, BaseFilename);
+    strcat(DeviceFilename, ".");
+    strcat(DeviceFilename, TargetInfix);
+    strcat(DeviceFilename, TUIndexStr);
+  }
+  free(BaseFilename);
+
+  if (IsVerboseMode())
+    PROF_NOTE("Device profile filename: %s\n", DeviceFilename);
+
+  // Manually write the profile data with a proper header
+  File = fopen(DeviceFilename, "w");
+  if (!File) {
+    PROF_ERR("Failed to open %s for writing\n", DeviceFilename);
+    goto cleanup;
+  }
+
+  __llvm_profile_header Header;
+  const uint64_t NumData = DataSize / sizeof(__llvm_profile_data);
+  const uint64_t NumCounters = CountersSize / sizeof(uint64_t);
+  const uint64_t NumBitmapBytes = 0;
+  const uint64_t VTableSectionSize = 0;
+  const uint64_t VNamesSize = 0;
+  uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
+      PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames,
+      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+
+  if (__llvm_profile_get_padding_sizes_for_counters(
+          DataSize, CountersSize, NumBitmapBytes, NamesSize, VTableSectionSize,
+          VNamesSize, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
+          &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames,
+          &PaddingBytesAfterVTable, &PaddingBytesAfterVNames) != 0) {
+    PROF_ERR("%s\n", "Failed to get padding sizes");
+    goto cleanup;
+  }
+
+  // Relocate pointers
+  __llvm_profile_data *RelocatedData = (__llvm_profile_data *)HostDataBegin;
+  for (uint64_t i = 0; i < NumData; ++i) {
+    if (RelocatedData[i].CounterPtr) {
+      ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr;
+      void *DeviceDataStructAddr =
+          (char *)DevDataBegin + (i * sizeof(__llvm_profile_data));
+      void *DeviceCountersAddr =
+          (char *)DeviceDataStructAddr + DeviceCounterPtrOffset;
+      ptrdiff_t OffsetIntoCountersSection =
+          (char *)DeviceCountersAddr - (char *)DevCntsBegin;
+
+      ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters +
+                                    OffsetIntoCountersSection -
+                                    (i * sizeof(__llvm_profile_data));
+      memcpy(&RelocatedData[i].CounterPtr, &NewRelativeOffset,
+             sizeof(NewRelativeOffset));
+    }
+    // Zero out unused fields
+    memset(&RelocatedData[i].BitmapPtr, 0,
+           sizeof(RelocatedData[i].BitmapPtr) +
+               sizeof(RelocatedData[i].FunctionPointer) +
+               sizeof(RelocatedData[i].Values));
+  }
+
+  // Populate header
+  Header.Magic = __llvm_profile_get_magic();
+  Header.Version = __llvm_profile_get_version();
+  Header.BinaryIdsSize = 0; // Not supported for device PGO yet
+  Header.NumData = NumData;
+  Header.PaddingBytesBeforeCounters = PaddingBytesBeforeCounters;
+  Header.NumCounters = NumCounters;
+  Header.PaddingBytesAfterCounters = PaddingBytesAfterCounters;
+  Header.NumBitmapBytes = NumBitmapBytes;
+  Header.PaddingBytesAfterBitmapBytes = PaddingBytesAfterBitmapBytes;
+  Header.NamesSize = NamesSize;
+  Header.CountersDelta = DataSize + PaddingBytesBeforeCounters;
+  Header.BitmapDelta =
+      Header.CountersDelta + CountersSize + PaddingBytesAfterCounters;
+  Header.NamesDelta =
+      Header.BitmapDelta + NumBitmapBytes + PaddingBytesAfterBitmapBytes;
+  Header.NumVTables = 0;
+  Header.VNamesSize = 0;
+  Header.ValueKindLast = 0; // No value profiling
+
+  // Write header and data
+  if (fwrite(&Header, sizeof(__llvm_profile_header), 1, File) != 1)
+    goto write_error;
+  if (fwrite(HostDataBegin, 1, DataSize, File) != DataSize)
+    goto write_error;
+  if (PaddingBytesBeforeCounters > 0 &&
+      fseek(File, PaddingBytesBeforeCounters, SEEK_CUR) != 0)
+    goto write_error;
+  if (fwrite(HostCountersBegin, 1, CountersSize, File) != CountersSize)
+    goto write_error;
+  if (PaddingBytesAfterCounters > 0 &&
+      fseek(File, PaddingBytesAfterCounters, SEEK_CUR) != 0)
+    goto write_error;
+  if (fwrite(HostNamesBegin, 1, NamesSize, File) != NamesSize)
+    goto write_error;
+
+  // Add padding after names to align to 8 bytes (required by profraw reader)
+  {
+    uint64_t NamesPadding = __llvm_profile_get_num_padding_bytes(NamesSize);
+    if (NamesPadding > 0) {
+      char ZeroPadding[8] = {0};
+      if (fwrite(ZeroPadding, 1, NamesPadding, File) != NamesPadding)
+        goto write_error;
+    }
+  }
+
+  if (IsVerboseMode())
+    PROF_NOTE("Successfully wrote profile data to %s\n", DeviceFilename);
+
+  // Write uniform counters to a separate file if available
+  if (UniformCountersSize > 0 && HostUniformCountersBegin) {
+    // Create uniform counters filename by replacing extension with .unifcnts
+    size_t DeviceFilenameLen = strlen(DeviceFilename);
+    char *UniformFilename = (char *)malloc(DeviceFilenameLen + 10);
+    if (UniformFilename) {
+      strcpy(UniformFilename, DeviceFilename);
+      // Find and replace .profraw extension
+      char *ext = strrchr(UniformFilename, '.');
+      if (ext) {
+        strcpy(ext, ".unifcnts");
+      } else {
+        strcat(UniformFilename, ".unifcnts");
+      }
+
+      FILE *UniformFile = fopen(UniformFilename, "wb");
+      if (UniformFile) {
+        // Write a simple header: magic, version, num_counters, counters_size
+        uint64_t UniformMagic = 0x55434E5450524F46ULL; // "UCNTPROF" in ASCII
+        uint64_t UniformVersion = 1;
+        uint64_t NumUniformCounters = UniformCountersSize / sizeof(uint64_t);
+
+        if (fwrite(&UniformMagic, sizeof(uint64_t), 1, UniformFile) != 1 ||
+            fwrite(&UniformVersion, sizeof(uint64_t), 1, UniformFile) != 1 ||
+            fwrite(&NumUniformCounters, sizeof(uint64_t), 1, UniformFile) !=
+                1 ||
+            fwrite(&UniformCountersSize, sizeof(uint64_t), 1, UniformFile) !=
+                1 ||
+            fwrite(HostUniformCountersBegin, 1, UniformCountersSize,
+                   UniformFile) != UniformCountersSize) {
+          PROF_WARN("Failed to write uniform counters to %s\n",
+                    UniformFilename);
+        } else if (IsVerboseMode()) {
+          PROF_NOTE(
+              "Successfully wrote %zu uniform counters (%zu bytes) to %s\n",
+              (size_t)NumUniformCounters, UniformCountersSize, UniformFilename);
+        }
+        fclose(UniformFile);
+      } else {
+        PROF_WARN("Failed to open %s for writing uniform counters\n",
+                  UniformFilename);
+      }
+      free(UniformFilename);
+    }
+  }
+
+  ret = 0;
+  goto cleanup;
+
+write_error:
+  PROF_ERR("Failed to write to %s\n", DeviceFilename);
+
+cleanup:
+  if (File)
+    fclose(File);
+  free(DeviceFilename);
+  free(HostCountersBegin);
+  free(HostDataBegin);
+  free(HostNamesBegin);
+  free(HostUniformCountersBegin);
+  return ret;
+}
+
+static int ProcessShadowVariable(void *ShadowVar, int TUIndex) {
+  void *DeviceOffloadPrf = NULL;
+  if (hipGetSymbolAddress(&DeviceOffloadPrf, ShadowVar) != 0) {
+    PROF_WARN("Failed to get symbol address for shadow variable %p\n",
+              ShadowVar);
+    return -1;
+  }
+  return ProcessDeviceOffloadPrf(DeviceOffloadPrf, TUIndex);
+}
+
+/* -------------------------------------------------------------------------- */
+/*  Collect device-side profile data                                          */
+/* -------------------------------------------------------------------------- */
+
+int __llvm_profile_hip_collect_device_data(void) {
+  if (IsVerboseMode())
+    PROF_NOTE("%s", "__llvm_profile_hip_collect_device_data called\n");
+
+  int Ret = 0;
+
+  /* Shadow variables (static-linked kernels) */
+  /* Always use TU index for consistent naming
+   * (profile.amdgcn-amd-amdhsa.0.profraw, etc.) */
+  for (int i = 0; i < NumShadowVariables; ++i) {
+    if (ProcessShadowVariable(OffloadShadowVariables[i], i) != 0)
+      Ret = -1;
+  }
+
+  /* Dynamically-loaded modules */
+  for (int i = 0; i < NumDynamicModules; ++i) {
+    OffloadDynamicModuleInfo *Info = &DynamicModules[i];
+    if (!Info->Processed) {
+      PROF_WARN("Dynamic module %p was not processed before unload\n",
+                Info->ModulePtr);
+      Ret = -1;
+    }
+  }
+
+  return Ret;
+}
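
Together with the filename scheme above (<base>.<target>[.<TUIndex>].<ext>),
a run with LLVM_PROFILE_FILE=default.profraw would produce, e.g.,
default.amdgcn-amd-amdhsa.0.profraw plus default.amdgcn-amd-amdhsa.0.unifcnts
when uniform counters are present. The .unifcnts side file is a small fixed
header (magic, version, counter count, byte size) followed by raw 64-bit
counters. A minimal reader sketch, assuming the writer's little-endian host
layout:

    // Sketch: parse a .unifcnts file as emitted above. Field order
    // matches the fwrite calls in this patch; error handling is minimal.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    bool readUnifCnts(const char *Path, std::vector<uint64_t> &Counters) {
      FILE *F = std::fopen(Path, "rb");
      if (!F)
        return false;
      uint64_t Hdr[4]; // magic, version, num_counters, counters_size
      bool OK = std::fread(Hdr, sizeof(uint64_t), 4, F) == 4 &&
                Hdr[0] == 0x55434E5450524F46ULL /* "UCNTPROF" */ &&
                Hdr[1] == 1 && Hdr[3] == Hdr[2] * sizeof(uint64_t);
      if (OK) {
        Counters.resize(Hdr[2]);
        OK = std::fread(Counters.data(), sizeof(uint64_t), Hdr[2], F) ==
             Hdr[2];
      }
      std::fclose(F);
      return OK;
    }
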
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f742476ac854a..ea24681521cae 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -894,15 +894,27 @@ struct InstrProfValueSiteRecord {
 struct InstrProfRecord {
   std::vector<uint64_t> Counts;
   std::vector<uint8_t> BitmapBytes;
+  /// For AMDGPU offload profiling: 1 bit per basic block indicating whether
+  /// the block is entered via a wave-uniform branch. Set during merge when
+  /// per-slot counters are reduced. If a counter value is a multiple of the
+  /// wave size, the branch is considered wave-uniform.
+  std::vector<uint8_t> UniformityBits;
+  uint16_t NumOffloadProfilingThreads = 0;
 
   InstrProfRecord() = default;
   InstrProfRecord(std::vector<uint64_t> Counts) : Counts(std::move(Counts)) {}
+  InstrProfRecord(std::vector<uint64_t> Counts,
+                  uint16_t NumOffloadProfilingThreads)
+      : Counts(std::move(Counts)),
+        NumOffloadProfilingThreads(NumOffloadProfilingThreads) {}
   InstrProfRecord(std::vector<uint64_t> Counts,
                   std::vector<uint8_t> BitmapBytes)
       : Counts(std::move(Counts)), BitmapBytes(std::move(BitmapBytes)) {}
   InstrProfRecord(InstrProfRecord &&) = default;
   InstrProfRecord(const InstrProfRecord &RHS)
       : Counts(RHS.Counts), BitmapBytes(RHS.BitmapBytes),
+        UniformityBits(RHS.UniformityBits),
+        NumOffloadProfilingThreads(RHS.NumOffloadProfilingThreads),
         ValueData(RHS.ValueData
                       ? std::make_unique<ValueProfData>(*RHS.ValueData)
                       : nullptr) {}
@@ -910,6 +922,8 @@ struct InstrProfRecord {
   InstrProfRecord &operator=(const InstrProfRecord &RHS) {
     Counts = RHS.Counts;
     BitmapBytes = RHS.BitmapBytes;
+    UniformityBits = RHS.UniformityBits;
+    NumOffloadProfilingThreads = RHS.NumOffloadProfilingThreads;
     if (!RHS.ValueData) {
       ValueData = nullptr;
       return *this;
@@ -921,6 +935,17 @@ struct InstrProfRecord {
     return *this;
   }
 
+  /// Check if a basic block is entered via a wave-uniform branch.
+  /// Returns true if uniform (safe for PGO spill optimization) or if no
+  /// uniformity data is available (conservative default).
+  bool isBlockUniform(unsigned BlockIdx) const {
+    if (UniformityBits.empty())
+      return true; // No uniformity data, assume uniform (conservative)
+    if (BlockIdx / 8 >= UniformityBits.size())
+      return true; // Out of range, assume uniform
+    return (UniformityBits[BlockIdx / 8] >> (BlockIdx % 8)) & 1;
+  }
+
   /// Return the number of value profile kinds with non-zero number
   /// of profile sites.
   inline uint32_t getNumValueKinds() const;
@@ -945,8 +970,12 @@ struct InstrProfRecord {
 
   /// Merge the counts in \p Other into this one.
   /// Optionally scale merged counts by \p Weight.
+  /// If \p WaveSize is non-zero and Other has offload profiling slots,
+  /// compute uniformity bits based on whether counter values are multiples
+  /// of WaveSize.
   LLVM_ABI void merge(InstrProfRecord &Other, uint64_t Weight,
-                      function_ref<void(instrprof_error)> Warn);
+                      function_ref<void(instrprof_error)> Warn,
+                      unsigned WaveSize = 0);
 
   /// Scale up profile counts (including value profile data) by
   /// a factor of (N / D).
@@ -1066,11 +1095,24 @@ struct NamedInstrProfRecord : InstrProfRecord {
   NamedInstrProfRecord(StringRef Name, uint64_t Hash,
                        std::vector<uint64_t> Counts)
       : InstrProfRecord(std::move(Counts)), Name(Name), Hash(Hash) {}
+  NamedInstrProfRecord(StringRef Name, uint64_t Hash,
+                       std::vector<uint64_t> Counts,
+                       uint16_t NumOffloadProfilingThreads)
+      : InstrProfRecord(std::move(Counts), NumOffloadProfilingThreads),
+        Name(Name), Hash(Hash) {}
   NamedInstrProfRecord(StringRef Name, uint64_t Hash,
                        std::vector<uint64_t> Counts,
                        std::vector<uint8_t> BitmapBytes)
       : InstrProfRecord(std::move(Counts), std::move(BitmapBytes)), Name(Name),
         Hash(Hash) {}
+  NamedInstrProfRecord(StringRef Name, uint64_t Hash,
+                       std::vector<uint64_t> Counts,
+                       std::vector<uint8_t> BitmapBytes,
+                       std::vector<uint8_t> UniformityBits)
+      : InstrProfRecord(std::move(Counts), std::move(BitmapBytes)), Name(Name),
+        Hash(Hash) {
+    this->UniformityBits = std::move(UniformityBits);
+  }
 
   static bool hasCSFlagInHash(uint64_t FuncHash) {
     return ((FuncHash >> CS_FLAG_IN_FUNC_HASH) & 1);
@@ -1177,7 +1219,9 @@ enum ProfVersion {
   Version12 = 12,
   // In this version, the frontend PGO stable hash algorithm defaults to V4.
   Version13 = 13,
-  // The current version is 13.
+  // UniformityBits added for AMDGPU offload profiling divergence detection.
+  Version14 = 14,
+  // The current version is 14.
   CurrentVersion = INSTR_PROF_INDEX_VERSION
 };
 const uint64_t Version = ProfVersion::CurrentVersion;
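
To make the UniformityBits layout behind isBlockUniform() concrete: one bit
per basic block, packed LSB-first within each byte, so block 10 lives in bit
2 of byte 1. A small worked check with hypothetical values:

    // Worked example of the bit layout used by isBlockUniform():
    // blocks 0..7 -> byte 0, blocks 8..15 -> byte 1, LSB first.
    #include <cstdint>
    #include <vector>

    bool checkBlock10() {
      std::vector<uint8_t> Bits = {0xFF, 0xFB}; // bit 2 of byte 1 clear
      unsigned BlockIdx = 10;
      return (Bits[BlockIdx / 8] >> (BlockIdx % 8)) & 1; // false: divergent
    }
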
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 46d6bb5bd8896..817223a9329ff 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -90,8 +90,12 @@ INSTR_PROF_DATA(IntPtrT, llvm::PointerType::getUnqual(Ctx), Values, \
 INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
 INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \
-                ConstantArray::get(Int16ArrayTy, Int16ArrayVals)) \
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
+                ConstantArray::get(Int16ArrayTy, Int16ArrayVals))
+INSTR_PROF_DATA(const uint16_t, llvm::Type::getInt16Ty(Ctx),
+                NumOffloadProfilingThreads,
+                ConstantInt::get(llvm::Type::getInt16Ty(Ctx),
+                                 NumOffloadProfilingThreadsVal))
+INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes,
                 ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumBitmapBytes))
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
@@ -324,6 +328,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_ucnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_UCNTS_COMMON), \
+                      INSTR_PROF_UCNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -722,7 +729,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 10
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 13
+#define INSTR_PROF_INDEX_VERSION 14
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
 
@@ -767,6 +774,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_UCNTS_COMMON __llvm_prf_ucnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
@@ -784,6 +792,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COFF ".lprfn$M"
 #define INSTR_PROF_VNAME_COFF ".lprfvn$M"
 #define INSTR_PROF_CNTS_COFF ".lprfc$M"
+#define INSTR_PROF_UCNTS_COFF ".lprfuc$M"
 #define INSTR_PROF_BITS_COFF ".lprfb$M"
 #define INSTR_PROF_VALS_COFF ".lprfv$M"
 #define INSTR_PROF_VNODES_COFF ".lprfnd$M"
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 1b24425e68a9e..444e87eabc238 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -90,6 +90,12 @@ class InstrProfWriter {
   // to the writer.
   memprof::MemProfSummaryBuilder MemProfSumBuilder;
 
+  // For AMDGPU offload profiling: the wave size used to detect uniform
+  // branches. If non-zero, uniformity bits will be computed during merge when
+  // per-slot counters are reduced. A block is considered uniform if all its
+  // counter values are multiples of WaveSize.
+  unsigned OffloadWaveSize = 0;
+
 public:
   // For memprof testing, random hotness can be assigned to the contexts if
   // MemprofGenerateRandomHotness is enabled. The random seed can be either
@@ -215,6 +221,10 @@ class InstrProfWriter {
     MemProfVersionRequested = Version;
   }
   void setMemProfFullSchema(bool Full) { MemProfFullSchema = Full; }
+
+  /// Set the wave size for AMDGPU offload profiling uniformity detection.
+  /// If non-zero, uniformity bits will be computed during merge.
+  void setOffloadWaveSize(unsigned WaveSize) { OffloadWaveSize = WaveSize; }
   // Compute the overlap b/w this object and Other. Program level result is
   // stored in Overlap and function level result is stored in FuncLevelOverlap.
   LLVM_ABI void overlapRecord(NamedInstrProfRecord &&Other,
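
As a usage sketch (hypothetical caller; the llvm-profdata plumbing appears
later in this patch), enabling uniformity detection for a wave64 target is a
single setter call, after which merge() treats a per-slot count of 128 as
uniform (128 % 64 == 0) and 96 as divergent (96 % 64 != 0):

    // Sketch: configure an InstrProfWriter for AMDGPU wave64 uniformity
    // detection. Only setOffloadWaveSize() is added by this patch; the
    // surrounding function is a hypothetical caller.
    #include "llvm/ProfileData/InstrProfWriter.h"

    void configureForWave64(llvm::InstrProfWriter &Writer) {
      Writer.setOffloadWaveSize(64); // 0 (the default) disables the check
    }
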
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 6b93b6cb83b4e..efef78eadd31e 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -286,12 +286,30 @@ template <class Edge, class BBInfo> class CFGMST {
     if (!Message.str().empty())
       OS << Message << "\n";
     OS << "  Number of Basic Blocks: " << BBInfos.size() << "\n";
-    for (auto &BI : BBInfos) {
-      const BasicBlock *BB = BI.first;
+    // Collect and sort BBInfos deterministically by their assigned Index.
+    std::vector<std::pair<const BasicBlock *, const BBInfo *>> SortedBBInfos;
+    SortedBBInfos.reserve(BBInfos.size());
+    for (const auto &BI : BBInfos)
+      SortedBBInfos.emplace_back(BI.first, BI.second.get());
+
+    llvm::sort(SortedBBInfos, [](const auto &A, const auto &B) {
+      // Primary key: BBInfo Index
+      if (A.second->Index != B.second->Index)
+        return A.second->Index < B.second->Index;
+      // Secondary key: name string to keep a stable order even if
+      // indices tie (ties shouldn't happen, but this makes ordering
+      // explicit).
+      StringRef NameA = A.first ? A.first->getName() : StringRef("FakeNode");
+      StringRef NameB = B.first ? B.first->getName() : StringRef("FakeNode");
+      return NameA < NameB;
+    });
+
+    for (const auto &P : SortedBBInfos) {
+      const BasicBlock *BB = P.first;
+      const BBInfo *Info = P.second;
       OS << "  BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << "  "
-         << BI.second->infoString() << "\n";
+         << Info->infoString() << "\n";
     }
-
     OS << "  Number of Edges: " << AllEdges.size()
        << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
     uint32_t Count = 0;
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 6b7e980d048a4..2cc44eea61495 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -2031,7 +2031,8 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before,
 
     assert(NodePosition.count(Source) == 1 && "Expected to find node.");
     DotCfgDiffNode &SourceNode = Nodes[NodePosition[Source]];
-    assert(NodePosition.count(Sink) == 1 && "Expected to find node.");
+    if (NodePosition.count(Sink) == 0)
+      continue;
     unsigned SinkNode = NodePosition[Sink];
     StringRef Colour = E.second;
 
@@ -2251,7 +2252,12 @@ void DotCfgChangeReporter::handleFunctionCompare(
   // Use the before entry block if the after entry block was removed.
   if (EntryBlockName == "")
     EntryBlockName = Before.getEntryBlockName();
-  assert(EntryBlockName != "" && "Expected to find entry block");
+
+  if (EntryBlockName.empty()) {
+    errs() << "Warning: could not find entry block for function " << Name
+           << ", skipping dot-cfg output for pass " << PassID << ".\n";
+    return;
+  }
 
   DotCfgDiffDisplayGraph DG = Diff.createDisplayGraph(Text, EntryBlockName);
   DG.generateDotFile(DotFile);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 82469481881c0..7aaf2acd6d5ec 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -957,7 +957,8 @@ void InstrProfRecord::mergeValueProfData(
 }
 
 void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight,
-                            function_ref<void(instrprof_error)> Warn) {
+                            function_ref<void(instrprof_error)> Warn,
+                            unsigned WaveSize) {
   // If the number of counters doesn't match we either have bad data
   // or a hash collision.
   if (Counts.size() != Other.Counts.size()) {
@@ -965,24 +966,92 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight,
     return;
   }
 
-  // Special handling of the first count as the PseudoCount.
-  CountPseudoKind OtherKind = Other.getCountPseudoKind();
-  CountPseudoKind ThisKind = getCountPseudoKind();
-  if (OtherKind != NotPseudo || ThisKind != NotPseudo) {
-    // We don't allow the merge of a profile with pseudo counts and
-    // a normal profile (i.e. without pesudo counts).
-    // Profile supplimenation should be done after the profile merge.
-    if (OtherKind == NotPseudo || ThisKind == NotPseudo) {
-      Warn(instrprof_error::count_mismatch);
-      return;
+  if (Other.NumOffloadProfilingThreads > 0) {
+    uint64_t NumThreads = Other.NumOffloadProfilingThreads;
+    uint64_t NumCounters = Other.Counts.size() / (NumThreads + 1);
+    std::vector<uint64_t> NewCounts(NumCounters, 0);
+
+    // If WaveSize is specified, compute uniformity bits for each block.
+    // A block is considered wave-uniform if all its per-slot counter values
+    // are multiples of WaveSize (meaning all lanes were active when executed).
+    //
+    // However, if Other.UniformityBits is already set (e.g., from .unifcnts
+    // file), use that instead of the WaveSize-modulo heuristic, as the
+    // .unifcnts-based detection is more accurate for data-dependent divergence.
+    std::vector<uint8_t> NewUniformityBits;
+    bool UseExistingUniformity = !Other.UniformityBits.empty();
+    if (UseExistingUniformity) {
+      // Use the uniformity bits already computed from .unifcnts
+      NewUniformityBits = Other.UniformityBits;
+    } else if (WaveSize > 0) {
+      NewUniformityBits.resize((NumCounters + 7) / 8, 0xFF); // Default: uniform
     }
-    if (OtherKind == PseudoHot || ThisKind == PseudoHot)
-      setPseudoCount(PseudoHot);
-    else
-      setPseudoCount(PseudoWarm);
+
+    for (size_t I = 0; I < NumCounters; ++I) {
+      uint64_t Sum = 0;
+      bool IsUniform = true;
+
+      for (size_t J = 0; J < NumThreads; ++J) {
+        uint64_t RawCount = Other.Counts[I * (NumThreads + 1) + J];
+
+        // Check uniformity: if count is non-zero and not a multiple of
+        // WaveSize, the block was entered via a divergent branch.
+        // Skip this check if we're using existing uniformity bits from
+        // .unifcnts.
+        if (!UseExistingUniformity && WaveSize > 0 && RawCount != 0 &&
+            (RawCount % WaveSize) != 0) {
+          IsUniform = false;
+        }
+
+        bool Overflowed;
+        uint64_t Value =
+            SaturatingMultiplyAdd(RawCount, Weight, uint64_t(0), &Overflowed);
+        if (Value > getInstrMaxCountValue()) {
+          Value = getInstrMaxCountValue();
+          Overflowed = true;
+        }
+        Sum += Value;
+        if (Overflowed)
+          Warn(instrprof_error::counter_overflow);
+      }
+      NewCounts[I] = Sum;
+
+      // Update uniformity bit for this block (only if not using existing bits)
+      if (!UseExistingUniformity && WaveSize > 0 && !IsUniform) {
+        // Clear the bit for non-uniform blocks
+        NewUniformityBits[I / 8] &= ~(1 << (I % 8));
+      }
+    }
+    Counts = NewCounts;
+    if (UseExistingUniformity || WaveSize > 0) {
+      UniformityBits = std::move(NewUniformityBits);
+    }
+    NumOffloadProfilingThreads = 0;
+
+    // Early return: offload data has been processed and reduced.
+    // Don't fall through to the regular merge loop which expects matching
+    // sizes.
     return;
+  } else {
+    // Special handling of the first count as the PseudoCount.
+    CountPseudoKind OtherKind = Other.getCountPseudoKind();
+    CountPseudoKind ThisKind = getCountPseudoKind();
+    if (OtherKind != NotPseudo || ThisKind != NotPseudo) {
+      // We don't allow the merge of a profile with pseudo counts and
+      // a normal profile (i.e. without pseudo counts).
+      // Profile supplementation should be done after the profile merge.
+      if (OtherKind == NotPseudo || ThisKind == NotPseudo) {
+        Warn(instrprof_error::count_mismatch);
+        return;
+      }
+      if (OtherKind == PseudoHot || ThisKind == PseudoHot)
+        setPseudoCount(PseudoHot);
+      else
+        setPseudoCount(PseudoWarm);
+      return;
+    }
   }
-
+  NumOffloadProfilingThreads = Other.NumOffloadProfilingThreads;
   for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) {
     bool Overflowed;
     uint64_t Value =
@@ -1022,15 +1091,32 @@ void InstrProfRecord::scaleValueProfData(
 void InstrProfRecord::scale(uint64_t N, uint64_t D,
                             function_ref<void(instrprof_error)> Warn) {
   assert(D != 0 && "D cannot be 0");
-  for (auto &Count : this->Counts) {
-    bool Overflowed;
-    Count = SaturatingMultiply(Count, N, &Overflowed) / D;
-    if (Count > getInstrMaxCountValue()) {
-      Count = getInstrMaxCountValue();
-      Overflowed = true;
+  if (NumOffloadProfilingThreads > 0) {
+    uint64_t NumThreads = NumOffloadProfilingThreads;
+    for (size_t I = 0, E = Counts.size(); I < E; I += NumThreads + 1) {
+      for (size_t J = 0; J < NumThreads; ++J) {
+        bool Overflowed;
+        uint64_t &Count = this->Counts[I + J];
+        Count = SaturatingMultiply(Count, N, &Overflowed) / D;
+        if (Count > getInstrMaxCountValue()) {
+          Count = getInstrMaxCountValue();
+          Overflowed = true;
+        }
+        if (Overflowed)
+          Warn(instrprof_error::counter_overflow);
+      }
+    }
+  } else {
+    for (auto &Count : this->Counts) {
+      bool Overflowed;
+      Count = SaturatingMultiply(Count, N, &Overflowed) / D;
+      if (Count > getInstrMaxCountValue()) {
+        Count = getInstrMaxCountValue();
+        Overflowed = true;
+      }
+      if (Overflowed)
+        Warn(instrprof_error::counter_overflow);
     }
-    if (Overflowed)
-      Warn(instrprof_error::counter_overflow);
   }
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     scaleValueProfData(Kind, N, D, Warn);
@@ -1692,7 +1778,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
       IndexedInstrProf::ProfVersion::CurrentVersion)
     return make_error<InstrProfError>(instrprof_error::unsupported_version);
 
-  static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
+  static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version14,
                 "Please update the reader as needed when a new field is added "
                 "or when indexed profile version gets bumped.");
 
@@ -1725,10 +1811,11 @@ size_t Header::size() const {
     // of the header, and byte offset of existing fields shouldn't change when
     // indexed profile version gets incremented.
     static_assert(
-        IndexedInstrProf::ProfVersion::CurrentVersion == Version13,
+        IndexedInstrProf::ProfVersion::CurrentVersion == Version14,
         "Please update the size computation below if a new field has "
         "been added to the header; for a version bump without new "
         "fields, add a case statement to fall through to the latest version.");
+  case 14ull: // UniformityBits added in record data, no header change
   case 13ull:
   case 12ull:
     return 72;
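
A concrete walk-through of the reduction in merge() above, with hypothetical
numbers: NumOffloadProfilingThreads = 2 and two blocks give a Counts layout
of [B0.slot0, B0.slot1, B0.spare, B1.slot0, B1.slot1, B1.spare]; with
WaveSize = 64, block 0's slots {64, 128} sum to 192 and stay uniform, while
block 1's slots {96, 0} sum to 96 and are marked divergent:

    // Sketch of the per-slot reduction and uniformity heuristic in
    // InstrProfRecord::merge() (hypothetical counter values).
    #include <cstdint>
    #include <vector>

    void reduceExample() {
      const unsigned WaveSize = 64;
      const uint16_t NumThreads = 2;
      std::vector<uint64_t> Counts = {64, 128, 0, 96, 0, 0};
      size_t NumCounters = Counts.size() / (NumThreads + 1);
      std::vector<uint64_t> Reduced(NumCounters, 0);
      std::vector<uint8_t> Uniform((NumCounters + 7) / 8, 0xFF);
      for (size_t I = 0; I < NumCounters; ++I)
        for (size_t J = 0; J < NumThreads; ++J) {
          uint64_t C = Counts[I * (NumThreads + 1) + J];
          Reduced[I] += C;
          if (C != 0 && C % WaveSize != 0)
            Uniform[I / 8] &= ~(1 << (I % 8)); // divergent block
        }
      // Reduced == {192, 96}; Uniform[0] == 0xFD (bit 1 cleared).
    }
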
diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
index 65fd5ba1c5ad2..07d9eacf38b01 100644
--- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp
+++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
@@ -298,6 +298,8 @@ void InstrProfCorrelatorImpl<IntPtrT>::addDataProbe(uint64_t NameRef,
       /*ValuesPtr=*/maybeSwap<IntPtrT>(0),
       maybeSwap<uint32_t>(NumCounters),
       /*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)},
+      // Offload profiling not used in correlation mode.
+      /*NumOffloadProfilingThreads=*/maybeSwap<uint16_t>(0),
       // TODO: MC/DC is not yet supported.
       /*NumBitmapBytes=*/maybeSwap<uint32_t>(0),
   });
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 8147ee8d0e816..25442d347bcc1 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -723,6 +723,14 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
   if (NumCounters == 0)
     return error(instrprof_error::malformed, "number of counters is zero");
 
+  // For GPU profiles with per-slot counters, the actual number of counter
+  // entries in the file is NumCounters * (NumOffloadProfilingThreads + 1).
+  // NumCounters in the data structure stores the base count (number of blocks),
+  // while the file contains expanded slots for wave-level profiling.
+  uint16_t NumOffloadThreads = swap(Data->NumOffloadProfilingThreads);
+  if (NumOffloadThreads > 0)
+    NumCounters *= (NumOffloadThreads + 1);
+
   ptrdiff_t CounterBaseOffset = swap(Data->CounterPtr) - CountersDelta;
   if (CounterBaseOffset < 0)
     return error(
@@ -873,6 +881,8 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record)
   if (Error E = readFuncHash(Record))
     return error(std::move(E));
 
+  Record.NumOffloadProfilingThreads = swap(Data->NumOffloadProfilingThreads);
+
   // Read raw counts and set Record.
   if (Error E = readRawCounts(Record))
     return error(std::move(E));
@@ -938,32 +948,54 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
                                          offset_type N) {
   using namespace support;
 
+  const unsigned char *OrigD = D;
   // Check if the data is corrupt. If so, don't try to read it.
-  if (N % sizeof(uint64_t))
+  if (N % sizeof(uint64_t)) {
+    fprintf(stderr,
+            "DEBUG: ReadData failed for %s: total data size %lu is not a "
+            "multiple of 8\n",
+            K.data(), (unsigned long)N);
     return data_type();
+  }
 
   DataBuffer.clear();
   std::vector<uint64_t> CounterBuffer;
   std::vector<uint8_t> BitmapByteBuffer;
+  std::vector<uint8_t> UniformityBitsBuffer;
 
   const unsigned char *End = D + N;
   while (D < End) {
     // Read hash.
-    if (D + sizeof(uint64_t) >= End)
+    if (D + sizeof(uint64_t) > End) {
+      fprintf(stderr,
+              "DEBUG: ReadData failed for %s: not enough data for hash. "
+              "Offset: %ld, End: %ld\n",
+              K.data(), (long)(D - OrigD), (long)N);
       return data_type();
+    }
     uint64_t Hash = endian::readNext<uint64_t, llvm::endianness::little>(D);
 
     // Initialize number of counters for GET_VERSION(FormatVersion) == 1.
     uint64_t CountsSize = N / sizeof(uint64_t) - 1;
     // If format version is different then read the number of counters.
     if (GET_VERSION(FormatVersion) != IndexedInstrProf::ProfVersion::Version1) {
-      if (D + sizeof(uint64_t) > End)
+      if (D + sizeof(uint64_t) > End) {
+        fprintf(stderr,
+                "DEBUG: ReadData failed for %s: not enough data for "
+                "CountsSize. Offset: %ld, End: %ld\n",
+                K.data(), (long)(D - OrigD), (long)N);
         return data_type();
+      }
       CountsSize = endian::readNext<uint64_t, llvm::endianness::little>(D);
     }
     // Read counter values.
-    if (D + CountsSize * sizeof(uint64_t) > End)
+    if (D + CountsSize * sizeof(uint64_t) > End) {
+      fprintf(stderr,
+              "DEBUG: ReadData failed for %s: not enough data for counters. "
+              "Offset: %ld, End: %ld, CountsSize: %lu\n",
+              K.data(), (long)(D - OrigD), (long)N, (unsigned long)CountsSize);
       return data_type();
+    }
 
     CounterBuffer.clear();
     CounterBuffer.reserve(CountsSize);
@@ -974,25 +1006,82 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
     // Read bitmap bytes for GET_VERSION(FormatVersion) > 10.
     if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version10) {
       uint64_t BitmapBytes = 0;
-      if (D + sizeof(uint64_t) > End)
+      if (D + sizeof(uint64_t) > End) {
+        fprintf(stderr,
+                "DEBUG: ReadData failed for %s: not enough data for "
+                "BitmapBytes size. Offset: %ld, End: %ld\n",
+                K.data(), (long)(D - OrigD), (long)N);
         return data_type();
+      }
       BitmapBytes = endian::readNext<uint64_t, llvm::endianness::little>(D);
       // Read bitmap byte values.
-      if (D + BitmapBytes * sizeof(uint8_t) > End)
+      uint64_t PaddedBitmapBytesSize = alignTo(BitmapBytes, sizeof(uint64_t));
+      if (D + PaddedBitmapBytesSize > End) {
+        fprintf(
+            stderr,
+            "DEBUG: ReadData failed for %s: not enough data for bitmap bytes. "
+            "Offset: %ld, End: %ld, BitmapBytes: %lu, PaddedSize: %lu\n",
+            K.data(), (long)(D - OrigD), (long)N, (unsigned long)BitmapBytes,
+            (unsigned long)PaddedBitmapBytesSize);
         return data_type();
+      }
       BitmapByteBuffer.clear();
       BitmapByteBuffer.reserve(BitmapBytes);
       for (uint64_t J = 0; J < BitmapBytes; ++J)
-        BitmapByteBuffer.push_back(static_cast<uint8_t>(
-            endian::readNext<uint64_t, llvm::endianness::little>(D)));
+        BitmapByteBuffer.push_back(
+            endian::readNext<uint8_t, llvm::endianness::little>(D));
+      // Skip padding.
+      for (uint64_t J = BitmapBytes; J < PaddedBitmapBytesSize; ++J)
+        (void)endian::readNext<uint8_t, llvm::endianness::little>(D);
+
+      // Read uniformity bits for Version14+ (AMDGPU offload profiling).
+      if (GET_VERSION(FormatVersion) >=
+          IndexedInstrProf::ProfVersion::Version14) {
+        uint64_t UniformityBitsSize = 0;
+        if (D + sizeof(uint64_t) > End) {
+          fprintf(stderr,
+                  "DEBUG: ReadData failed for %s: not enough data for "
+                  "UniformityBits size. Offset: %ld, End: %ld\n",
+                  K.data(), (long)(D - OrigD), (long)N);
+          return data_type();
+        }
+        UniformityBitsSize =
+            endian::readNext<uint64_t, llvm::endianness::little>(D);
+        uint64_t PaddedUniformityBitsSize =
+            alignTo(UniformityBitsSize, sizeof(uint64_t));
+        if (D + PaddedUniformityBitsSize > End) {
+          fprintf(stderr,
+                  "DEBUG: ReadData failed for %s: not enough data for "
+                  "uniformity bits. "
+                  "Offset: %ld, End: %ld, UniformityBitsSize: %lu, "
+                  "PaddedSize: %lu\n",
+                  K.data(), (long)(D - OrigD), (long)N,
+                  (unsigned long)UniformityBitsSize,
+                  (unsigned long)PaddedUniformityBitsSize);
+          return data_type();
+        }
+        UniformityBitsBuffer.clear();
+        UniformityBitsBuffer.reserve(UniformityBitsSize);
+        for (uint64_t J = 0; J < UniformityBitsSize; ++J)
+          UniformityBitsBuffer.push_back(
+              endian::readNext<uint8_t, llvm::endianness::little>(D));
+        // Skip padding.
+        for (uint64_t J = UniformityBitsSize; J < PaddedUniformityBitsSize; ++J)
+          (void)endian::readNext<uint8_t, llvm::endianness::little>(D);
+      }
     }
 
     DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer),
-                            std::move(BitmapByteBuffer));
+                            std::move(BitmapByteBuffer),
+                            std::move(UniformityBitsBuffer));
 
     // Read value profiling data.
     if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version2 &&
         !readValueProfilingData(D, End)) {
+      fprintf(stderr,
+              "DEBUG: ReadData failed for %s: readValueProfilingData failed. "
+              "Offset: %ld, End: %ld\n",
+              K.data(), (long)(D - OrigD), (long)N);
       DataBuffer.clear();
       return data_type();
     }
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 0f15ca8ff6df7..6fe624ee7c147 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -51,6 +51,7 @@ class InstrProfRecordWriterTrait {
   llvm::endianness ValueProfDataEndianness = llvm::endianness::little;
   InstrProfSummaryBuilder *SummaryBuilder;
   InstrProfSummaryBuilder *CSSummaryBuilder;
+  bool WritePrevVersion = false;
 
   InstrProfRecordWriterTrait() = default;
 
@@ -58,7 +59,7 @@ class InstrProfRecordWriterTrait {
     return IndexedInstrProf::ComputeHash(K);
   }
 
-  static std::pair<offset_type, offset_type>
+  std::pair<offset_type, offset_type>
   EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
     using namespace support;
 
@@ -72,9 +73,18 @@ class InstrProfRecordWriterTrait {
       const InstrProfRecord &ProfRecord = ProfileData.second;
       M += sizeof(uint64_t); // The function hash
       M += sizeof(uint64_t); // The size of the Counts vector
-      M += ProfRecord.Counts.size() * sizeof(uint64_t);
+      size_t NumCounters = ProfRecord.Counts.size();
+      if (ProfRecord.NumOffloadProfilingThreads > 0) {
+        NumCounters /= (ProfRecord.NumOffloadProfilingThreads + 1);
+      }
+      M += NumCounters * sizeof(uint64_t);
       M += sizeof(uint64_t); // The size of the Bitmap vector
-      M += ProfRecord.BitmapBytes.size() * sizeof(uint64_t);
+      M += alignTo(ProfRecord.BitmapBytes.size(), sizeof(uint64_t));
+      // UniformityBits is only written in Version14+
+      if (!WritePrevVersion) {
+        M += sizeof(uint64_t); // The size of the UniformityBits vector
+        M += alignTo(ProfRecord.UniformityBits.size(), sizeof(uint64_t));
+      }
 
       // Value data
       M += ValueProfData::getSize(ProfileData.second);
@@ -88,7 +98,8 @@ class InstrProfRecordWriterTrait {
     Out.write(K.data(), N);
   }
 
-  void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
+  void EmitData(raw_ostream &Out, key_type_ref K, data_type_ref V,
+                offset_type) {
     using namespace support;
 
     endian::Writer LE(Out, llvm::endianness::little);
@@ -100,13 +111,41 @@ class InstrProfRecordWriterTrait {
         SummaryBuilder->addRecord(ProfRecord);
 
       LE.write<uint64_t>(ProfileData.first); // Function hash
-      LE.write<uint64_t>(ProfRecord.Counts.size());
-      for (uint64_t I : ProfRecord.Counts)
-        LE.write<uint64_t>(I);
+      if (ProfRecord.NumOffloadProfilingThreads > 0) {
+        uint64_t NumThreads = ProfRecord.NumOffloadProfilingThreads;
+        uint64_t NumCounters = ProfRecord.Counts.size() / (NumThreads + 1);
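+        // Illustrative layout with the default 8-bit slot width
+        // (NumThreads = 255): each source counter occupies 256 consecutive
+        // slots in Counts, and slot index NumThreads (the overflow slot in
+        // pattern-overflow mode) is excluded from the sum below.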
+        LE.write<uint64_t>(NumCounters);
+        for (size_t I = 0; I < NumCounters; ++I) {
+          uint64_t Sum = 0;
+          for (size_t J = 0; J < NumThreads; ++J)
+            Sum += ProfRecord.Counts[I * (NumThreads + 1) + J];
+          LE.write<uint64_t>(Sum);
+        }
+      } else {
+        LE.write<uint64_t>(ProfRecord.Counts.size());
+        for (uint64_t I : ProfRecord.Counts)
+          LE.write<uint64_t>(I);
+      }
 
       LE.write<uint64_t>(ProfRecord.BitmapBytes.size());
-      for (uint64_t I : ProfRecord.BitmapBytes)
-        LE.write<uint64_t>(I);
+      for (uint8_t I : ProfRecord.BitmapBytes)
+        LE.write<uint8_t>(I);
+      // Pad with zeros.
+      for (size_t I = ProfRecord.BitmapBytes.size();
+           I < alignTo(ProfRecord.BitmapBytes.size(), sizeof(uint64_t)); ++I)
+        LE.write<uint8_t>(0);
+
+      // Write uniformity bits (for AMDGPU offload profiling, Version14+)
+      if (!WritePrevVersion) {
+        LE.write<uint64_t>(ProfRecord.UniformityBits.size());
+        for (uint8_t I : ProfRecord.UniformityBits)
+          LE.write<uint8_t>(I);
+        // Pad with zeros.
+        for (size_t I = ProfRecord.UniformityBits.size();
+             I < alignTo(ProfRecord.UniformityBits.size(), sizeof(uint64_t));
+             ++I)
+          LE.write<uint8_t>(0);
+      }
 
       // Write value data
       std::unique_ptr<ValueProfData> VDataPtr =
@@ -207,9 +246,19 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash,
     Dest = std::move(I);
     if (Weight > 1)
       Dest.scale(Weight, 1, MapWarn);
+    // For new records with offload profiling slots, compute uniformity bits
+    // if WaveSize is specified.
+    if (OffloadWaveSize > 0 && Dest.NumOffloadProfilingThreads > 0) {
+      // Create a temporary record to merge into an empty one to trigger
+      // uniformity computation.
+      InstrProfRecord Temp;
+      Temp.Counts.resize(Dest.Counts.size());
+      Temp.merge(Dest, 1, MapWarn, OffloadWaveSize);
+      Dest = std::move(Temp);
+    }
   } else {
     // We're updating a function we've seen before.
-    Dest.merge(I, Weight, MapWarn);
+    Dest.merge(I, Weight, MapWarn, OffloadWaveSize);
   }
 
   Dest.sortValueData();
@@ -524,6 +573,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   InfoObj->SummaryBuilder = &ISB;
   InstrProfSummaryBuilder CSISB(ProfileSummaryBuilder::DefaultCutoffs);
   InfoObj->CSSummaryBuilder = &CSISB;
+  InfoObj->WritePrevVersion = WritePrevVersion;
 
   // Populate the hash table generator.
   SmallVector<std::pair<StringRef, const ProfilingData *>> OrderedData;
@@ -542,7 +592,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   // The WritePrevVersion handling will either need to be removed or updated
   // if the version is advanced beyond 12.
   static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
-                IndexedInstrProf::ProfVersion::Version13);
+                IndexedInstrProf::ProfVersion::Version14);
   if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
     Header.Version |= VARIANT_MASK_IR_PROF;
   if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f6f73fb69f7a9..f12edbe20f65d 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -33,12 +34,15 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -160,6 +164,28 @@ cl::opt<bool> SpeculativeCounterPromotionToLoop(
              " update can be further/iteratively promoted into an acyclic "
              " region."));
 
+static cl::opt<unsigned> OffloadProfilingThreadBitWidth(
+    "offload-profiling-thread-bitwidth",
+    cl::desc("Bit width that encodes the number of profiling threads on the "
+             "offload device.  The actual thread count used is "
+             "(1 << bitwidth) - 1.  Supported for AMDGPU only."),
+    cl::init(8));
+
+enum class OffloadPGOSamplingMode {
+  PatternOverflow, // sampling by pattern, overflow slot, non-atomic store
+  AtomicWarpLeader // no sampling; warp leader uses atomicrmw add 1
+};
+
+static llvm::cl::opt<OffloadPGOSamplingMode> OffloadPGOSampling(
+    "offload-pgo-sampling-mode", llvm::cl::desc("Offload PGO sampling mode"),
+    llvm::cl::values(
+        clEnumValN(OffloadPGOSamplingMode::PatternOverflow, "pattern-overflow",
+                   "Use sampling pattern and overflow slot (default)"),
+        clEnumValN(
+            OffloadPGOSamplingMode::AtomicWarpLeader, "atomic-warp-leader",
+            "Leader lane only; atomic increment per slot; no overflow slot")),
+    llvm::cl::init(OffloadPGOSamplingMode::AtomicWarpLeader));
+
 cl::opt<bool> IterativeCounterPromotion(
     "iterative-counter-promotion", cl::init(true),
     cl::desc("Allow counter promotion across the whole loop nest."));
@@ -242,6 +268,20 @@ static bool profDataReferencedByCode(const Module &M) {
   return enablesValueProfiling(M);
 }
 
+// Extract CUID (Compilation Unit ID) from the module.
+// HIP/CUDA modules have a global variable __hip_cuid_<hash> that uniquely
+// identifies each translation unit. Returns empty string if not found.
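+// For example, a global named "__hip_cuid_4ae1b2" (hypothetical value)
+// yields "4ae1b2".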
+static std::string getCUIDFromModule(const Module &M) {
+  for (const GlobalVariable &GV : M.globals()) {
+    StringRef Name = GV.getName();
+    if (Name.starts_with("__hip_cuid_")) {
+      // Extract the hash suffix after "__hip_cuid_"
+      return Name.drop_front(strlen("__hip_cuid_")).str();
+    }
+  }
+  return "";
+}
+
 class InstrLowerer final {
 public:
   InstrLowerer(Module &M, const InstrProfOptions &Options,
@@ -266,7 +306,8 @@ class InstrLowerer final {
   struct PerFunctionProfileData {
     uint32_t NumValueSites[IPVK_Last + 1] = {};
     GlobalVariable *RegionCounters = nullptr;
-    GlobalVariable *DataVar = nullptr;
+    GlobalVariable *UniformCounters = nullptr; // For AMDGPU divergence tracking
+    GlobalValue *DataVar = nullptr;
     GlobalVariable *RegionBitmaps = nullptr;
     uint32_t NumBitmapBytes = 0;
 
@@ -288,6 +329,24 @@ class InstrLowerer final {
   GlobalVariable *NamesVar = nullptr;
   size_t NamesSize = 0;
 
+  // For GPU targets: per-TU contiguous allocation of profile data.
+  // Instead of separate per-function counters (which linker can reorder),
+  // we allocate one contiguous array for all counters in the TU.
+  GlobalVariable *ContiguousCnts = nullptr; // All counters in one array
+  GlobalVariable *ContiguousData =
+      nullptr; // All __llvm_profile_data in one array
+  GlobalVariable *ContiguousUCnts =
+      nullptr; // All uniform counters in one array
+  StructType *ProfileDataTy = nullptr;
+  SmallVector<Constant *, 16> ContiguousDataInits;
+  std::string CachedCUID; // CUID cached for consistent section naming
+
+  // Map from function name GlobalVariable to offset in contiguous arrays
+  DenseMap<GlobalVariable *, uint64_t> FunctionCounterOffsets;
+  DenseMap<GlobalVariable *, uint64_t> FunctionDataOffsets;
+  uint64_t TotalCounterSlots = 0; // Total slots across all functions
+  uint64_t TotalDataEntries = 0;  // Total __llvm_profile_data entries
+
   // vector of counter load/store pairs to be register promoted.
   std::vector<LoadStorePair> PromotionCandidates;
 
@@ -325,6 +384,9 @@ class InstrLowerer final {
   /// Replace instrprof.increment with an increment of the appropriate value.
   void lowerIncrement(InstrProfIncrementInst *Inc);
 
+  /// AMDGPU specific implementation of lowerIncrement.
+  void lowerIncrementAMDGPU(InstrProfIncrementInst *Inc);
+
   /// Force emitting of name vars for unused functions.
   void lowerCoverageData(GlobalVariable *CoverageNamesVar);
 
@@ -349,6 +411,10 @@ class InstrLowerer final {
   /// referring to them will also be created.
   GlobalVariable *getOrCreateRegionCounters(InstrProfCntrInstBase *Inc);
 
+  /// Get the uniform entry counters for AMDGPU divergence tracking.
+  /// These counters track how often blocks are entered with all lanes active.
+  GlobalVariable *getOrCreateUniformCounters(InstrProfCntrInstBase *Inc);
+
   /// Create the region counters.
   GlobalVariable *createRegionCounters(InstrProfCntrInstBase *Inc,
                                        StringRef Name,
@@ -408,6 +474,30 @@ class InstrLowerer final {
   /// Create a static initializer for our data, on platforms that need it,
   /// and for any profile output file that was specified.
   void emitInitialization();
+
+  /// For GPU targets: Collect all profiling intrinsics and allocate
+  /// contiguous arrays for counters, data, and uniform counters.
+  /// This avoids linker reordering issues with section boundaries.
+  void allocateContiguousProfileArrays();
+
+  /// Return the __llvm_profile_data struct type.
+  StructType *getProfileDataTy();
+
+  /// Finalize initializer for contiguous __llvm_profile_data array.
+  void finalizeContiguousProfileData();
+
+  /// Create __llvm_offload_prf structure for GPU targets.
+  /// Must be called AFTER contiguous arrays are allocated.
+  void createProfileSectionSymbols();
+
+  /// Create HIP device variable registration for profile symbols
+  void createHIPDeviceVariableRegistration();
+
+  /// Create HIP dynamic module registration call
+  void createHIPDynamicModuleRegistration();
+
+  /// Create HIP dynamic module unregistration call
+  void createHIPDynamicModuleUnregistration();
 };
 
 ///
@@ -939,6 +1029,10 @@ bool InstrLowerer::lower() {
   if (!ContainsProfiling && !CoverageNamesVar)
     return MadeChange;
 
+  // For GPU targets: allocate contiguous arrays for all profile data.
+  // This avoids linker reordering issues with per-function arrays.
+  allocateContiguousProfileArrays();
+
   // We did not know how many value sites there would be inside
   // the instrumented function. This is counting the number of instrumented
   // target value sites to enter it as field in the profile data variable.
@@ -983,10 +1077,22 @@ bool InstrLowerer::lower() {
   if (!MadeChange)
     return false;
 
+  finalizeContiguousProfileData();
+
   emitVNodes();
   emitNameData();
   emitVTableNames();
 
+  // Create start/stop symbols for device code profile sections
+  createProfileSectionSymbols();
+
+  // Create host shadow variables and registration calls for HIP device profile
+  // symbols
+  createHIPDeviceVariableRegistration();
+
+  createHIPDynamicModuleRegistration();
+  createHIPDynamicModuleUnregistration();
+
   // Emit runtime hook for the cases where the target does not unconditionally
   // require pulling in profile runtime, and coverage is enabled on code that is
   // not eliminated by the front-end, e.g. unused functions with internal
@@ -1046,7 +1152,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   assert(It != ProfileDataMap.end() && It->second.DataVar &&
          "value profiling detected in function with no counter increment");
 
-  GlobalVariable *DataVar = It->second.DataVar;
+  GlobalValue *DataVar = It->second.DataVar;
   uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
   uint64_t Index = Ind->getIndex()->getZExtValue();
   for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
@@ -1058,7 +1164,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   CallInst *Call = nullptr;
   auto *TLI = &GetTLI(*Ind->getFunction());
   auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      DataVar, PointerType::get(M.getContext(), 0));
+      cast<Constant>(DataVar), PointerType::get(M.getContext(), 0));
 
   // To support value profiling calls within Windows exception handlers, funclet
   // information contained within operand bundles needs to be copied over to
@@ -1108,6 +1214,9 @@ GlobalVariable *InstrLowerer::getOrCreateBiasVar(StringRef VarName) {
 }
 
 Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
+  // Note: For AMDGPU targets, lowerIncrementAMDGPU handles counter addressing
+  // directly using ContiguousCnts. This function is called for non-AMDGPU
+  // targets.
   auto *Counters = getOrCreateRegionCounters(I);
   IRBuilder<> Builder(I);
 
@@ -1190,6 +1299,10 @@ void InstrLowerer::lowerTimestamp(
 }
 
 void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
+  if (TT.isAMDGPU()) {
+    lowerIncrementAMDGPU(Inc);
+    return;
+  }
   auto *Addr = getCounterAddress(Inc);
 
   IRBuilder<> Builder(Inc);
@@ -1208,6 +1321,349 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   Inc->eraseFromParent();
 }
 
+// Lowers an InstrProfIncrementInst for AMDGPU to per-wave aggregated counter
+// updates. It computes a "slot" index based on block and warp-local indices,
+// elects a leader lane, and increments a counter by (Inc->getStep() *
+// number_of_active_lanes) to reflect that only one lane performs the update on
+// behalf of the whole wave.
+//
+// Assumptions:
+// - AMDGPU Wave32 (32 lanes): uses ballot.i32, mbcnt.lo with a full mask, and
+// lane = mbcnt & 31.
+// - OffloadProfilingThreadBitWidth (KSlotBits) >= 5; kWarpBits is 5 for Wave32.
+// - Two modes:
+//   - PatternOverflow: performs a non-atomic RMW, routes to an overflow slot
+//   based on sampling.
+//   - AtomicWarpLeader: only the elected leader performs an atomic add.
+// - Inc->getStep() is an LLVM integer-typed Value (often, but not
+// necessarily, the constant 1).
+// - The increment amount in both modes is Inc->getStep() *
+// popcount(activeMask).
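+//
+// Worked example (illustrative numbers): with the default KSlotBits = 8 and
+// gridDim.x = 1024, blockBits = 10, usedForHi = min(10, 8 - 5) = 3, and
+// sampBits = 7, so blockHi = blockIdx.x >> 7 and
+// slotRaw = (blockHi << 5) | warpLocal, which fits in 8 bits whenever
+// blockDim.x <= 1024 (i.e. warpLocal < 32).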
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+  IRBuilder<> Builder(Inc);
+  LLVMContext &Context = M.getContext();
+  auto *Int1Ty = Type::getInt1Ty(Context);
+  auto *Int8Ty = Type::getInt8Ty(Context);
+  auto *Int16Ty = Type::getInt16Ty(Context);
+  auto *Int32Ty = Type::getInt32Ty(Context);
+  auto *Int64Ty = Type::getInt64Ty(Context);
+
+  // Constants/configuration
+  const unsigned KSlotBits =
+      OffloadProfilingThreadBitWidth; // must be >= 5 (Wave32)
+  const unsigned KSlots = 1u << KSlotBits;
+  const unsigned KOverflow = KSlots - 1u; // only used in PatternOverflow mode
+  const unsigned KPattern14 = 0x2A3Fu;    // only used in PatternOverflow mode
+  const unsigned kWarpBits = 5u;          // Wave32 lane width
+
+  if (KSlotBits < kWarpBits)
+    report_fatal_error(
+        "OffloadProfilingThreadBitWidth must be >= 5 for wave32");
+
+  // --- Get thread and block identifiers ---
+  FunctionCallee BlockIdxFn =
+      M.getOrInsertFunction("llvm.amdgcn.workgroup.id.x", Int32Ty);
+  Value *BlockIdx = Builder.CreateCall(BlockIdxFn, {}, "BlockIdxX");
+
+  FunctionCallee ThreadIdxFn =
+      M.getOrInsertFunction("llvm.amdgcn.workitem.id.x", Int32Ty);
+  Value *ThreadIdx = Builder.CreateCall(ThreadIdxFn, {}, "ThreadIdxX");
+
+  // --- Get launch-time data from implicit arguments ---
+  FunctionCallee ImplicitArgFn = M.getOrInsertFunction(
+      "llvm.amdgcn.implicitarg.ptr", PointerType::get(Context, 4));
+  Value *ImplicitArgPtr = Builder.CreateCall(ImplicitArgFn, {});
+
+  // gridDim.x (i32) at base
+  Value *GridDimX = Builder.CreateLoad(Int32Ty, ImplicitArgPtr, "GridDimX");
+
+  // blockDim.x (i16) at offset 12
+  Value *BlockDimXAddr = Builder.CreateInBoundsGEP(
+      Int8Ty, ImplicitArgPtr, ConstantInt::get(Int64Ty, 12), "BlockDimXAddr");
+  Value *BlockDimX = Builder.CreateLoad(Int16Ty, BlockDimXAddr, "BlockDimX");
+
+  // --- Optional: 64-bit global thread id (not used by the slot
+  // calculation; computed for debugging and discarded below) ---
+  Value *BlockIdx64 = Builder.CreateZExt(BlockIdx, Int64Ty, "BlockIdxX.zext");
+  Value *ThreadIdx64 =
+      Builder.CreateZExt(ThreadIdx, Int64Ty, "ThreadIdxX.zext");
+  Value *BlockDimX64 = Builder.CreateZExt(BlockDimX, Int64Ty, "BlockDimX.zext");
+  Value *Gid = Builder.CreateAdd(Builder.CreateMul(BlockIdx64, BlockDimX64),
+                                 ThreadIdx64, "Gid");
+  (void)Gid;
+
+  // ----------------------------
+  // Common slot computation (Wave32)
+  // ----------------------------
+
+  // Lane id via amdgcn.mbcnt.lo (counts active lanes below this one);
+  // lane = mbcnt & 31.
+  auto *MbcntLoTy = FunctionType::get(Int32Ty, {Int32Ty, Int32Ty}, false);
+  FunctionCallee MbcntLoFnByName =
+      M.getOrInsertFunction("llvm.amdgcn.mbcnt.lo", MbcntLoTy);
+  Value *MbcntLo = Builder.CreateCall(
+      MbcntLoFnByName,
+      {ConstantInt::getSigned(Int32Ty, -1), ConstantInt::get(Int32Ty, 0)},
+      "mbcnt.lo");
+  Value *Lane = Builder.CreateAnd(MbcntLo, 31, "lane");
+
+  // warpLocal = threadIdx.x >> 5
+  Value *WarpLocal = Builder.CreateLShr(ThreadIdx, kWarpBits, "warpLocal");
+
+  // blockBits = (gridDim.x > 1) ? (32 - ctlz(gridDim.x - 1)) : 1
+  Value *GridGt1 = Builder.CreateICmpUGT(GridDimX, ConstantInt::get(Int32Ty, 1),
+                                         "grid_gt_1");
+  Value *GridDimXMinus1 = Builder.CreateSub(
+      GridDimX, ConstantInt::get(Int32Ty, 1), "gridDimX_minus_1");
+  FunctionCallee CtlzI32Fn =
+      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ctlz, {Int32Ty});
+  Value *CtlzVal = Builder.CreateCall(
+      CtlzI32Fn, {GridDimXMinus1, Builder.getFalse()}, "ctlz_gridDimX_minus_1");
+  Value *BlockBitsCandidate = Builder.CreateSub(ConstantInt::get(Int32Ty, 32),
+                                                CtlzVal, "blockBits_cand");
+  Value *BlockBits = Builder.CreateSelect(
+      GridGt1, BlockBitsCandidate, ConstantInt::get(Int32Ty, 1), "blockBits");
+
+  // usedForHi = min(blockBits, KSlotBits - kWarpBits)
+  Value *SlotHiBits = ConstantInt::get(Int32Ty, (int)(KSlotBits - kWarpBits));
+  Value *BlockLtSlotHi = Builder.CreateICmpULT(BlockBits, SlotHiBits);
+  Value *UsedForHi =
+      Builder.CreateSelect(BlockLtSlotHi, BlockBits, SlotHiBits, "usedForHi");
+
+  // sampBits = blockBits - usedForHi
+  Value *SampBits = Builder.CreateSub(BlockBits, UsedForHi, "sampBits");
+  Value *SampBitsIsZero = Builder.CreateIsNull(SampBits, "sampBits_is_zero");
+
+  // blockHi = (sampBits == 0) ? blockIdx.x : (blockIdx.x >> sampBits)
+  Value *BlockHiShifted =
+      Builder.CreateLShr(BlockIdx, SampBits, "blockHi_shifted");
+  Value *BlockHi =
+      Builder.CreateSelect(SampBitsIsZero, BlockIdx, BlockHiShifted, "blockHi");
+
+  // slotRaw = (blockHi << 5) | warpLocal
+  Value *SlotRawUpper = Builder.CreateShl(BlockHi, kWarpBits, "slotRaw_upper");
+  Value *SlotRaw = Builder.CreateOr(SlotRawUpper, WarpLocal, "slotRaw");
+
+  // Find warp leader using ballot.i32 + cttz
+  auto *BallotTy = FunctionType::get(Int32Ty, {Int1Ty}, false);
+  FunctionCallee BallotI32ByName =
+      M.getOrInsertFunction("llvm.amdgcn.ballot.i32", BallotTy);
+  Value *ActiveMask = Builder.CreateCall(
+      BallotI32ByName, {ConstantInt::getTrue(Context)}, "activeMask");
+
+  FunctionCallee CttzI32Fn =
+      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::cttz, {Int32Ty});
+  Value *ActiveMaskNonZero =
+      Builder.CreateICmpNE(ActiveMask, ConstantInt::get(Int32Ty, 0), "mask_nz");
+  Value *LeaderLane = Builder.CreateCall(
+      CttzI32Fn, {ActiveMask, ConstantInt::getTrue(Context)}, "leaderLane");
+  Value *IsLeader = Builder.CreateICmpEQ(Lane, LeaderLane, "isLeader");
+  Value *IsLeaderGuarded =
+      Builder.CreateSelect(ActiveMaskNonZero, IsLeader,
+                           ConstantInt::getFalse(Context), "isLeader_guarded");
+
+  // Compute number of active lanes and step * active lanes
+  FunctionCallee CtpopI32Fn =
+      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ctpop, {Int32Ty});
+  Value *NumActive = Builder.CreateCall(CtpopI32Fn, {ActiveMask}, "numActive");
+
+  Value *IncStep = Inc->getStep(); // integer-typed Value (often i64)
+  Value *NumActiveCast = Builder.CreateZExtOrTrunc(
+      NumActive, IncStep->getType(), "numActive.cast");
+  Value *StepTimesActive =
+      Builder.CreateMul(IncStep, NumActiveCast, "step_times_active");
+
+  // Check if all lanes are active (uniform execution for wave32)
+  // A full wave has ActiveMask == 0xFFFFFFFF. Partial waves (last wave of
+  // workgroup) will be conservatively marked as divergent.
+  Value *FullWaveMask =
+      ConstantInt::getSigned(Int32Ty, -1); // 0xFFFFFFFF for wave32
+  Value *IsUniform =
+      Builder.CreateICmpEQ(ActiveMask, FullWaveMask, "isUniform");
+
+  // ----------------------------
+  // Mode-dependent writer logic
+  // ----------------------------
+
+  Value *Slot = nullptr;
+  Value *IsWriter = nullptr;
+
+  if (OffloadPGOSampling == OffloadPGOSamplingMode::PatternOverflow) {
+    // Sampling mask/pattern over low sampBits of blockIdx.x
+    Value *One32 = ConstantInt::get(Int32Ty, 1);
+    Value *SampMaskShift =
+        Builder.CreateShl(One32, SampBits, "sampMask_shift"); // 1<<sampBits
+    Value *SampMaskMinus1 =
+        Builder.CreateSub(SampMaskShift, One32, "sampMask_minus1");
+    Value *SampMask =
+        Builder.CreateSelect(SampBitsIsZero, ConstantInt::get(Int32Ty, 0),
+                             SampMaskMinus1, "sampMask");
+
+    // sampPat = KPattern14 & sampMask
+    Value *SampPat = Builder.CreateAnd(ConstantInt::get(Int32Ty, KPattern14),
+                                       SampMask, "sampPat");
+
+    // matched = (sampBits == 0) ? true : ((blockIdx.x & sampMask) == sampPat)
+    Value *BlockMasked = Builder.CreateAnd(BlockIdx, SampMask, "blockMasked");
+    Value *CmpMaskPat =
+        Builder.CreateICmpEQ(BlockMasked, SampPat, "cmp_mask_pat");
+    Value *Matched = Builder.CreateSelect(
+        SampBitsIsZero, ConstantInt::getTrue(Context), CmpMaskPat, "matched");
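+    // Illustrative: with sampBits = 7, sampMask = 0x7F and
+    // sampPat = 0x2A3F & 0x7F = 0x3F, so only blocks whose low 7 bits equal
+    // 0x3F write to their own slot (a 1/128 sample); all other waves fall
+    // through to the overflow slot below.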
+
+    // Only leader writes when matched
+    IsWriter = Builder.CreateAnd(IsLeaderGuarded, Matched, "isWriter");
+
+    // Route to overflow if not writer or slotRaw == KOverflow
+    Value *SlotRawIsOverflow = Builder.CreateICmpEQ(
+        SlotRaw, ConstantInt::get(Int32Ty, KOverflow), "slot_is_overflow");
+    Value *GoodWriter = Builder.CreateAnd(
+        IsWriter, Builder.CreateNot(SlotRawIsOverflow), "goodWriter");
+    Slot = Builder.CreateSelect(GoodWriter, SlotRaw,
+                                ConstantInt::get(Int32Ty, KOverflow), "Slot");
+  } else {
+    // AtomicWarpLeader: no sampling, no overflow. Only the leader writes
+    // atomically.
+    IsWriter = IsLeaderGuarded;
+    Slot = SlotRaw;
+  }
+
+  // --- Calculate final counter index ---
+  auto *OldCounterIdx = Inc->getIndex();
+  auto *NumSlots = Builder.getInt32(KSlots);
+  auto *CounterIdxBase = Builder.CreateMul(OldCounterIdx, NumSlots);
+  auto *CounterIdx = Builder.CreateAdd(CounterIdxBase, Slot, "CounterIdx");
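+  // E.g. source counter index 2 with KSlots = 256 and Slot = 17 maps to
+  // expanded index 2 * 256 + 17 = 529 (illustrative numbers).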
+
+  // --- Counter address ---
+  // For contiguous allocation, use the contiguous array with function offset
+  GlobalVariable *Counters = nullptr;
+  GlobalVariable *UniformCounters = nullptr;
+  Value *Addr = nullptr;
+  Value *UniformAddr = nullptr;
+
+  if (ContiguousCnts) {
+    // Contiguous allocation mode: use offset into shared array
+    GlobalVariable *NamePtr = Inc->getName();
+    uint64_t FuncOffset = FunctionCounterOffsets.lookup(NamePtr);
+
+    // Add function offset to counter index
+    Value *OffsetCounterIdx = Builder.CreateAdd(
+        CounterIdx, Builder.getInt32(FuncOffset), "OffsetCounterIdx");
+
+    Counters = ContiguousCnts;
+    Value *Indices[] = {Builder.getInt32(0), OffsetCounterIdx};
+    Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+                                     Indices, "ctr.addr");
+
+    // Uniform counters also use contiguous array
+    if (ContiguousUCnts) {
+      UniformCounters = ContiguousUCnts;
+      Value *UniformIndices[] = {Builder.getInt32(0), OffsetCounterIdx};
+      UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+                                              UniformCounters, UniformIndices,
+                                              "unifctr.addr");
+    }
+  } else {
+    // Per-function allocation mode (non-GPU or fallback)
+    Counters = getOrCreateRegionCounters(Inc);
+    Value *Indices[] = {Builder.getInt32(0), CounterIdx};
+    Addr = Builder.CreateInBoundsGEP(Counters->getValueType(), Counters,
+                                     Indices, "ctr.addr");
+
+    // Uniform counter address (for divergence tracking)
+    UniformCounters = getOrCreateUniformCounters(Inc);
+    if (UniformCounters) {
+      Value *UniformIndices[] = {Builder.getInt32(0), CounterIdx};
+      UniformAddr = Builder.CreateInBoundsGEP(UniformCounters->getValueType(),
+                                              UniformCounters, UniformIndices,
+                                              "unifctr.addr");
+    }
+  }
+
+  // --- Increment ---
+  if (OffloadPGOSampling == OffloadPGOSamplingMode::PatternOverflow) {
+    // Non-atomic increment by (Inc->getStep() * numActive) (legacy mode)
+    Type *CounterTy =
+        cast<ArrayType>(Counters->getValueType())->getElementType();
+    Value *Load = Builder.CreateLoad(CounterTy, Addr, "pgocount");
+    Value *ProdToCounterTy = Builder.CreateZExtOrTrunc(
+        StepTimesActive, CounterTy, "step_times_active.cast");
+    auto *Count = Builder.CreateAdd(Load, ProdToCounterTy, "pgocount.next");
+    Builder.CreateStore(Count, Addr);
+
+    // Also update uniform counter if uniform
+    if (UniformAddr) {
+      Value *UniformLoad =
+          Builder.CreateLoad(CounterTy, UniformAddr, "unifcount");
+      // Only add to uniform counter if IsUniform is true
+      Value *UniformIncr =
+          Builder.CreateSelect(IsUniform, ProdToCounterTy,
+                               ConstantInt::get(CounterTy, 0), "unifincr");
+      auto *UniformCount =
+          Builder.CreateAdd(UniformLoad, UniformIncr, "unifcount.next");
+      Builder.CreateStore(UniformCount, UniformAddr);
+    }
+  } else {
+    // AtomicWarpLeader: only the leader performs an atomicrmw add of
+    // (step * numActive). Control flow: split the block at Inc, create
+    // ThenBB, and branch conditionally.
+
+    // 1) Split the current block before Inc. The split inserts an unconditional
+    //    branch from CurBB to ContBB; we'll replace it with a conditional
+    //    branch.
+    BasicBlock *CurBB = Builder.GetInsertBlock();
+    Function *F = CurBB->getParent();
+    BasicBlock *ContBB =
+        CurBB->splitBasicBlock(BasicBlock::iterator(Inc), "atomic_cont");
+
+    // After split, CurBB ends with "br label %atomic_cont".
+    // 2) Create the ThenBB (atomic path).
+    BasicBlock *ThenBB = BasicBlock::Create(Context, "atomic_then", F);
+
+    // 3) Replace the terminator in CurBB with a conditional branch to ThenBB or
+    // ContBB.
+    Instruction *OldTerm =
+        CurBB->getTerminator(); // unconditional branch inserted by split
+    OldTerm->eraseFromParent();
+    IRBuilder<> HeadBuilder(CurBB);
+    HeadBuilder.CreateCondBr(IsWriter, ThenBB, ContBB);
+
+    // 4) Emit the atomicrmw in ThenBB, then branch to ContBB.
+    IRBuilder<> ThenBuilder(ThenBB);
+    Type *CounterTy =
+        cast<ArrayType>(Counters->getValueType())->getElementType();
+    Value *ProdToCounterTy = ThenBuilder.CreateZExtOrTrunc(
+        StepTimesActive, CounterTy, "step_times_active.cast");
+    ThenBuilder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, ProdToCounterTy,
+                                MaybeAlign(Align(8)),
+                                AtomicOrdering::Monotonic);
+
+    // Also update the uniform counter when the wave is uniform (inside
+    // ThenBB, so only the leader does it).
+    if (UniformAddr) {
+      // Create a nested conditional: only update uniform counter if IsUniform
+      BasicBlock *UniformBB = BasicBlock::Create(Context, "uniform_then", F);
+      BasicBlock *AfterUniformBB =
+          BasicBlock::Create(Context, "uniform_cont", F);
+
+      ThenBuilder.CreateCondBr(IsUniform, UniformBB, AfterUniformBB);
+
+      IRBuilder<> UniformBuilder(UniformBB);
+      UniformBuilder.CreateAtomicRMW(AtomicRMWInst::Add, UniformAddr,
+                                     ProdToCounterTy, MaybeAlign(Align(8)),
+                                     AtomicOrdering::Monotonic);
+      UniformBuilder.CreateBr(AfterUniformBB);
+
+      IRBuilder<> AfterUniformBuilder(AfterUniformBB);
+      AfterUniformBuilder.CreateBr(ContBB);
+    } else {
+      ThenBuilder.CreateBr(ContBB);
+    }
+
+    // 5) Continue in the continuation block and erase the original Inc.
+    Builder.SetInsertPoint(ContBB, ContBB->begin());
+  }
+
+  Inc->eraseFromParent();
+}
+
 void InstrLowerer::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
   ConstantArray *Names =
       cast<ConstantArray>(CoverageNamesVar->getInitializer());
@@ -1611,7 +2067,15 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
   Ptr->setVisibility(Visibility);
   // Put the counters and bitmaps in their own sections so linkers can
   // remove unneeded sections.
-  Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat()));
+  // For GPU targets, use per-TU sections with CUID suffix for proper
+  // memory tracking via anchor variable registration.
+  std::string SectionName = getInstrProfSectionName(IPSK, TT.getObjectFormat());
+  if (isGPUProfTarget(M)) {
+    std::string CUID = getCUIDFromModule(M);
+    if (!CUID.empty())
+      SectionName = SectionName + "_" + CUID;
+  }
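+  // e.g. "__llvm_prf_cnts" becomes "__llvm_prf_cnts_<cuid>", where <cuid>
+  // is the hash suffix of this TU's __hip_cuid_* variable.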
+  Ptr->setSection(SectionName);
   Ptr->setLinkage(Linkage);
   maybeSetComdat(Ptr, Fn, VarName);
   return Ptr;
@@ -1647,7 +2111,12 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) {
 GlobalVariable *
 InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
                                    GlobalValue::LinkageTypes Linkage) {
+  const unsigned OffloadNumProfilingThreads =
+      (1u << OffloadProfilingThreadBitWidth) - 1;
+
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+  if (TT.isAMDGPU())
+    NumCounters *= (OffloadNumProfilingThreads + 1);
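+  // E.g. with the default 8-bit thread bit width, a function with 4 source
+  // counters gets a 4 * 256 = 1024 slot array.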
   auto &Ctx = M.getContext();
   GlobalVariable *GV;
   if (isa<InstrProfCoverInst>(Inc)) {
@@ -1676,6 +2145,18 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   if (PD.RegionCounters)
     return PD.RegionCounters;
 
+  // For GPU targets with contiguous allocation, use the contiguous array
+  // instead of creating a per-function array
+  if (ContiguousCnts) {
+    // Store the contiguous array as RegionCounters for this function
+    // The actual offset is handled in lowerIncrementAMDGPU
+    PD.RegionCounters = ContiguousCnts;
+
+    // Still create the data variable (it will point to the right offset)
+    createDataVariable(Inc);
+    return PD.RegionCounters;
+  }
+
   // If RegionCounters doesn't already exist, create it by first setting up
   // the corresponding profile section.
   auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
@@ -1723,6 +2204,59 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   return PD.RegionCounters;
 }
 
+GlobalVariable *
+InstrLowerer::getOrCreateUniformCounters(InstrProfCntrInstBase *Inc) {
+  // Only create uniform counters for AMDGPU targets
+  if (!TT.isAMDGPU())
+    return nullptr;
+
+  GlobalVariable *NamePtr = Inc->getName();
+  auto &PD = ProfileDataMap[NamePtr];
+  if (PD.UniformCounters)
+    return PD.UniformCounters;
+
+  // For contiguous allocation, use the contiguous uniform counter array
+  if (ContiguousUCnts) {
+    PD.UniformCounters = ContiguousUCnts;
+    return PD.UniformCounters;
+  }
+
+  // Ensure RegionCounters exists first (we need the same size)
+  getOrCreateRegionCounters(Inc);
+
+  // Create uniform counters with the same size as region counters
+  const unsigned OffloadNumProfilingThreads =
+      (1u << OffloadProfilingThreadBitWidth) - 1;
+
+  uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+  NumCounters *= (OffloadNumProfilingThreads + 1);
+
+  auto &Ctx = M.getContext();
+  auto *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+  // Use a different prefix for uniform counters
+  bool Renamed;
+  std::string VarName = getVarName(Inc, "__llvm_prf_unifcnt_", Renamed);
+
+  auto *GV = new GlobalVariable(M, CounterTy, false, NamePtr->getLinkage(),
+                                Constant::getNullValue(CounterTy), VarName);
+  GV->setAlignment(Align(8));
+  GV->setVisibility(NamePtr->getVisibility());
+
+  // For GPU targets, use per-TU sections with CUID suffix
+  std::string SectionName =
+      getInstrProfSectionName(IPSK_ucnts, TT.getObjectFormat());
+  std::string CUID = getCUIDFromModule(M);
+  if (!CUID.empty())
+    SectionName = SectionName + "_" + CUID;
+  GV->setSection(SectionName);
+
+  PD.UniformCounters = GV;
+  CompilerUsedVars.push_back(GV);
+
+  return PD.UniformCounters;
+}
+
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
@@ -1784,8 +2318,25 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
         ValuesVar, PointerType::get(Fn->getContext(), 0));
   }
 
+  // NumCounters in __llvm_profile_data is the ORIGINAL counter count,
+  // not the expanded count with slots. The expansion factor is stored
+  // separately in NumOffloadProfilingThreads.
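+  // E.g. with the default 8-bit width, a function with 4 source counters
+  // records NumCounters = 4 here even though its in-memory array holds
+  // 4 * 256 slots; the profile writer collapses the slots back when
+  // emitting the indexed profile.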
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
-  auto *CounterPtr = PD.RegionCounters;
+
+  // For contiguous allocation, CounterPtr should point to this function's
+  // offset within the contiguous array
+  Constant *CounterPtr;
+  if (ContiguousCnts && PD.RegionCounters == ContiguousCnts) {
+    uint64_t FuncOffset = FunctionCounterOffsets.lookup(NamePtr);
+    // Create a GEP to the function's counter offset
+    CounterPtr = ConstantExpr::getInBoundsGetElementPtr(
+        ContiguousCnts->getValueType(), ContiguousCnts,
+        ArrayRef<Constant *>{
+            ConstantInt::get(Type::getInt64Ty(Ctx), 0),
+            ConstantInt::get(Type::getInt64Ty(Ctx), FuncOffset)});
+  } else {
+    CounterPtr = PD.RegionCounters;
+  }
 
   uint64_t NumBitmapBytes = PD.NumBitmapBytes;
 
@@ -1793,11 +2344,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
   auto *Int16Ty = Type::getInt16Ty(Ctx);
   auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
-  Type *DataTypes[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
-  };
-  auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  auto *DataTy = getProfileDataTy();
 
   Constant *FunctionAddr = getFuncAddrForProfData(Fn);
 
@@ -1805,6 +2352,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
+  uint16_t NumOffloadProfilingThreadsVal = 0;
+  if (TT.isAMDGPU())
+    NumOffloadProfilingThreadsVal = (1u << OffloadProfilingThreadBitWidth) - 1;
+
   if (isGPUProfTarget(M)) {
     Linkage = GlobalValue::ExternalLinkage;
     Visibility = GlobalValue::ProtectedVisibility;
@@ -1826,8 +2377,25 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     Linkage = GlobalValue::PrivateLinkage;
     Visibility = GlobalValue::DefaultVisibility;
   }
-  auto *Data =
-      new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName);
+  GlobalValue *DataVar = nullptr;
+  Constant *DataAddr = nullptr;
+  uint64_t DataIndex = 0;
+  if (ContiguousData) {
+    DataIndex = FunctionDataOffsets.lookup(NamePtr);
+    assert(DataIndex < ContiguousDataInits.size() &&
+           "missing contiguous data slot");
+    DataAddr = ConstantExpr::getInBoundsGetElementPtr(
+        ContiguousData->getValueType(), ContiguousData,
+        ArrayRef<Constant *>{
+            ConstantInt::get(Type::getInt64Ty(Ctx), 0),
+            ConstantInt::get(Type::getInt64Ty(Ctx), DataIndex)});
+  } else {
+    auto *Data =
+        new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName);
+    DataVar = Data;
+    DataAddr = Data;
+  }
+
   Constant *RelativeCounterPtr;
   GlobalVariable *BitmapPtr = PD.RegionBitmaps;
   Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0);
@@ -1845,29 +2413,48 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     DataSectionKind = IPSK_data;
     RelativeCounterPtr =
         ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
-                             ConstantExpr::getPtrToInt(Data, IntPtrTy));
+                             ConstantExpr::getPtrToInt(DataAddr, IntPtrTy));
     if (BitmapPtr != nullptr)
       RelativeBitmapPtr =
           ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
-                               ConstantExpr::getPtrToInt(Data, IntPtrTy));
+                               ConstantExpr::getPtrToInt(DataAddr, IntPtrTy));
   }
 
   Constant *DataVals[] = {
 #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
 #include "llvm/ProfileData/InstrProfData.inc"
   };
-  Data->setInitializer(ConstantStruct::get(DataTy, DataVals));
-
-  Data->setVisibility(Visibility);
-  Data->setSection(
-      getInstrProfSectionName(DataSectionKind, TT.getObjectFormat()));
-  Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
-  maybeSetComdat(Data, Fn, CntsVarName);
+  auto *DataInit = ConstantStruct::get(DataTy, DataVals);
+
+  if (ContiguousData) {
+    ContiguousDataInits[DataIndex] = DataInit;
+    auto *Alias = GlobalAlias::create(
+        DataTy, DataAddr->getType()->getPointerAddressSpace(), Linkage,
+        DataVarName, DataAddr, &M);
+    Alias->setVisibility(Visibility);
+    DataVar = Alias;
+  } else {
+    auto *DataGV = cast<GlobalVariable>(DataVar);
+    DataGV->setInitializer(DataInit);
+
+    DataGV->setVisibility(Visibility);
+    // For GPU targets, use per-TU sections with CUID suffix
+    std::string DataSectionName =
+        getInstrProfSectionName(DataSectionKind, TT.getObjectFormat());
+    if (isGPUProfTarget(M)) {
+      std::string CUID = getCUIDFromModule(M);
+      if (!CUID.empty())
+        DataSectionName = DataSectionName + "_" + CUID;
+    }
+    DataGV->setSection(DataSectionName);
+    DataGV->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
+    maybeSetComdat(DataGV, Fn, CntsVarName);
+  }
 
-  PD.DataVar = Data;
+  PD.DataVar = DataVar;
 
   // Mark the data variable as used so that it isn't stripped out.
-  CompilerUsedVars.push_back(Data);
+  CompilerUsedVars.push_back(DataVar);
   // Now that the linkage set by the FE has been passed to the data and counter
   // variables, reset Name variable's linkage and visibility to private so that
   // it can be removed later by the compiler.
@@ -1927,6 +2514,104 @@ void InstrLowerer::emitVNodes() {
   UsedVars.push_back(VNodesVar);
 }
 
+void InstrLowerer::createHIPDynamicModuleRegistration() {
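+  // Dynamic-module registration is host-side only; skip device compiles.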
+  if (isGPUProfTarget(M))
+    return;
+  LLVM_DEBUG(llvm::dbgs() << "Entering createHIPDynamicModuleRegistration\n");
+  StringRef FuncNames[] = {"hipModuleLoad", "hipModuleLoadData",
+                           "hipModuleLoadDataEx"};
+  for (StringRef FuncName : FuncNames) {
+    Function *F = M.getFunction(FuncName);
+    if (!F)
+      continue;
+
+    for (User *U : F->users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        Instruction *InsertPt = nullptr;
+        // If the call is an invoke instruction, we should insert the
+        // registration call in the normal destination block.
+        if (auto *Invoke = dyn_cast<InvokeInst>(CB)) {
+          InsertPt = &*Invoke->getNormalDest()->getFirstInsertionPt();
+        } else if (CB->isTerminator()) {
+          // If it's another kind of terminator (e.g., callbr), we don't
+          // know the semantics of the successors, so we conservatively
+          // skip it. The hipModuleLoad* functions are not expected to be
+          // used in other terminator instructions.
+          continue;
+        } else {
+          // This is a normal call instruction, so we can insert after it.
+          InsertPt = CB->getNextNode();
+        }
+
+        // If there's no valid insertion point (e.g., a malformed block),
+        // skip.
+        if (!InsertPt)
+          continue;
+
+        IRBuilder<> Builder(InsertPt);
+        auto *VoidTy = Type::getVoidTy(M.getContext());
+        // The second argument to the registration function is the module
+        // handle, which is an out-parameter (hipModule_t*), so its type in the
+        // call is hipModule_t**.
+        auto *ModuleHandlePtrTy = PointerType::get(M.getContext(), 0);
+        auto *Int32Ty = Type::getInt32Ty(M.getContext());
+        auto *RegisterDynamicModuleTy =
+            FunctionType::get(VoidTy, {Int32Ty, ModuleHandlePtrTy}, false);
+        FunctionCallee RegisterFunc = M.getOrInsertFunction(
+            "__llvm_profile_offload_register_dynamic_module",
+            RegisterDynamicModuleTy);
+
+        // The first argument to the registration function is the return value
+        // of the hipModuleLoad* call (hipError_t, which is i32).
+        Value *ReturnValue = CB;
+        // The second argument is the module handle itself, which is the first
+        // argument to the hipModuleLoad* call.
+        Value *ModuleHandle = CB->getArgOperand(0);
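+        // Source-level view of the rewrite (illustrative):
+        //   err = hipModuleLoad(&Mod, FName);
+        //   __llvm_profile_offload_register_dynamic_module(err, &Mod);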
+
+        auto *Call =
+            Builder.CreateCall(RegisterFunc, {ReturnValue, ModuleHandle});
+        LLVM_DEBUG(llvm::dbgs() << "Register HIP module loaded by "; CB->dump();
+                   llvm::dbgs() << "BB:\n"; Call->getParent()->dump(););
+      }
+    }
+  }
+}
+
+void InstrLowerer::createHIPDynamicModuleUnregistration() {
+  LLVM_DEBUG(llvm::dbgs() << "Entering createHIPDynamicModuleUnregistration\n");
+  Function *F = M.getFunction("hipModuleUnload");
+  if (!F)
+    return;
+
+  for (User *U : F->users()) {
+    if (auto *CB = dyn_cast_or_null<CallBase>(U)) {
+      // The insertion point is right before the call to hipModuleUnload.
+      Instruction *InsertPt = CB;
+
+      IRBuilder<> Builder(InsertPt);
+      auto *VoidTy = Type::getVoidTy(M.getContext());
+      auto *VoidPtrTy = PointerType::getUnqual(M.getContext());
+
+      auto *UnregisterDynamicModuleTy =
+          FunctionType::get(VoidTy, {VoidPtrTy}, false);
+      FunctionCallee UnregisterFunc =
+          M.getOrInsertFunction("__llvm_profile_hip_unregister_dynamic_module",
+                                UnregisterDynamicModuleTy);
+
+      // The argument is the module handle, which is the first
+      // argument to the hipModuleUnload call.
+      Value *ModuleHandle = CB->getArgOperand(0);
+      Value *CastedModuleHandle =
+          Builder.CreatePointerCast(ModuleHandle, VoidPtrTy);
+
+      auto *Call = Builder.CreateCall(UnregisterFunc, {CastedModuleHandle});
+      LLVM_DEBUG(llvm::dbgs() << "Unregister HIP module unloaded by ";
+                 CB->dump(); llvm::dbgs() << "BB:\n";
+                 Call->getParent()->dump(););
+    }
+  }
+}
+
 void InstrLowerer::emitNameData() {
   if (ReferencedNames.empty())
     return;
@@ -1940,9 +2625,15 @@ void InstrLowerer::emitNameData() {
   auto &Ctx = M.getContext();
   auto *NamesVal =
       ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false);
-  NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
-                                GlobalValue::PrivateLinkage, NamesVal,
-                                getInstrProfNamesVarName());
+  std::string NamesVarName = std::string(getInstrProfNamesVarName());
+  if (isGPUProfTarget(M)) {
+    std::string CUID = CachedCUID.empty() ? getCUIDFromModule(M) : CachedCUID;
+    if (!CUID.empty())
+      NamesVarName = NamesVarName + "_" + CUID;
+  }
+  NamesVar =
+      new GlobalVariable(M, NamesVal->getType(), true,
+                         GlobalValue::PrivateLinkage, NamesVal, NamesVarName);
   if (isGPUProfTarget(M)) {
     NamesVar->setLinkage(GlobalValue::ExternalLinkage);
     NamesVar->setVisibility(GlobalValue::ProtectedVisibility);
@@ -1950,10 +2641,17 @@ void InstrLowerer::emitNameData() {
 
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
-  NamesVar->setSection(
+  // For GPU targets, use per-TU sections with CUID suffix
+  std::string NamesSectionName =
       ProfileCorrelate == InstrProfCorrelator::BINARY
           ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat())
-          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+          : getInstrProfSectionName(IPSK_name, TT.getObjectFormat());
+  if (isGPUProfTarget(M)) {
+    std::string CUID = getCUIDFromModule(M);
+    if (!CUID.empty())
+      NamesSectionName = NamesSectionName + "_" + CUID;
+  }
+  NamesVar->setSection(NamesSectionName);
   // On COFF, it's important to reduce the alignment down to 1 to prevent the
   // linker from inserting padding before the start of the names section or
   // between names entries.
@@ -2160,3 +2858,466 @@ void createProfileSamplingVar(Module &M) {
   appendToCompilerUsed(M, SamplingVar);
 }
 } // namespace llvm
+
+namespace {
+
+// Return (and cache) the __llvm_profile_data struct type built from the
+// field list in InstrProfData.inc.
+StructType *InstrLowerer::getProfileDataTy() {
+  if (ProfileDataTy)
+    return ProfileDataTy;
+
+  auto &Ctx = M.getContext();
+  auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  auto *Int16Ty = Type::getInt16Ty(Ctx);
+  auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+  Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+  };
+  ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+  return ProfileDataTy;
+}
+
+void InstrLowerer::finalizeContiguousProfileData() {
+  if (!ContiguousData || ContiguousDataInits.empty())
+    return;
+
+  auto *DataTy = getProfileDataTy();
+  for (auto &Entry : ContiguousDataInits)
+    if (!Entry)
+      Entry = Constant::getNullValue(DataTy);
+
+  auto *DataArrayTy = cast<ArrayType>(ContiguousData->getValueType());
+  ContiguousData->setInitializer(
+      ConstantArray::get(DataArrayTy, ContiguousDataInits));
+}
+
+// For GPU targets: allocate contiguous arrays for all profile data. This
+// solves the linker reordering problem by using ONE symbol per section
+// type, so there's nothing for the linker to reorder.
+void InstrLowerer::allocateContiguousProfileArrays() {
+  LLVM_DEBUG(llvm::dbgs() << "allocateContiguousProfileArrays() called\n");
+
+  // Only for GPU device targets
+  if (!isGPUProfTarget(M)) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "Not a GPU target, skipping contiguous allocation\n");
+    return;
+  }
+
+  // Get and cache the CUID for consistent section naming
+  CachedCUID = getCUIDFromModule(M);
+  if (CachedCUID.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << "No CUID found in module, using fallback\n");
+    CachedCUID = std::to_string(std::hash<std::string>{}(M.getName().str()));
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Allocating contiguous arrays for CUID="
+                          << CachedCUID << "\n");
+
+  // First pass: collect all instrprof intrinsics and count total counters
+  const unsigned KSlots = 1u << OffloadProfilingThreadBitWidth;
+  TotalCounterSlots = 0;
+  TotalDataEntries = 0;
+
+  // We need to iterate through all functions and collect the first profiling
+  // intrinsic from each, which determines the counter size for that function.
+  SmallVector<std::pair<GlobalVariable *, uint64_t>, 16> FunctionCounters;
+
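+  // Layout sketch (illustrative numbers): if the first function has 3
+  // source counters and KSlots = 256, it occupies slots [0, 768) and the
+  // next function's counters start at offset 768.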
+  for (Function &F : M) {
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        if (auto *Inc = dyn_cast<InstrProfIncrementInst>(&I)) {
+          GlobalVariable *NamePtr = Inc->getName();
+          // Only count each function once
+          if (FunctionCounterOffsets.count(NamePtr) == 0) {
+            uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+            uint64_t NumSlots = NumCounters * KSlots;
+
+            FunctionCounterOffsets[NamePtr] = TotalCounterSlots;
+            FunctionDataOffsets[NamePtr] = TotalDataEntries;
+            FunctionCounters.push_back({NamePtr, NumSlots});
+
+            TotalCounterSlots += NumSlots;
+            TotalDataEntries++;
+
+            LLVM_DEBUG(llvm::dbgs()
+                       << "  Function " << getPGOFuncNameVarInitializer(NamePtr)
+                       << ": " << NumCounters << " counters, " << NumSlots
+                       << " slots, offset=" << (TotalCounterSlots - NumSlots)
+                       << "\n");
+          }
+          break; // The first intrinsic carries the per-function counter
+                 // count; the map check above prevents double counting.
+        }
+        if (auto *Cover = dyn_cast<InstrProfCoverInst>(&I)) {
+          GlobalVariable *NamePtr = Cover->getName();
+          if (FunctionCounterOffsets.count(NamePtr) == 0) {
+            uint64_t NumCounters = Cover->getNumCounters()->getZExtValue();
+            // Coverage uses i8 counters, but for simplicity we still
+            // allocate one i64 slot per counter here.
+            FunctionCounterOffsets[NamePtr] = TotalCounterSlots;
+            FunctionDataOffsets[NamePtr] = TotalDataEntries;
+            FunctionCounters.push_back({NamePtr, NumCounters});
+
+            TotalCounterSlots += NumCounters;
+            TotalDataEntries++;
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Total: " << TotalCounterSlots
+                          << " counter slots, " << TotalDataEntries
+                          << " data entries\n");
+
+  if (TotalCounterSlots == 0) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "No counters found, skipping contiguous array creation\n");
+    return;
+  }
+
+  auto &Ctx = M.getContext();
+  auto *Int64Ty = Type::getInt64Ty(Ctx);
+
+  // Create contiguous counter array
+  auto *CntsArrayTy = ArrayType::get(Int64Ty, TotalCounterSlots);
+  std::string CntsSectionName = "__llvm_prf_cnts_" + CachedCUID;
+  ContiguousCnts = new GlobalVariable(
+      M, CntsArrayTy, /*isConstant=*/false, GlobalValue::ExternalLinkage,
+      Constant::getNullValue(CntsArrayTy), "__profc_all_" + CachedCUID);
+  ContiguousCnts->setSection(CntsSectionName);
+  ContiguousCnts->setAlignment(Align(8));
+  ContiguousCnts->setVisibility(GlobalValue::ProtectedVisibility);
+  CompilerUsedVars.push_back(ContiguousCnts);
+
+  // Create contiguous uniform counter array (for AMDGPU divergence tracking)
+  std::string UCntsSectionName = "__llvm_prf_ucnts_" + CachedCUID;
+  ContiguousUCnts = new GlobalVariable(
+      M, CntsArrayTy, /*isConstant=*/false, GlobalValue::ExternalLinkage,
+      Constant::getNullValue(CntsArrayTy), "__profu_all_" + CachedCUID);
+  ContiguousUCnts->setSection(UCntsSectionName);
+  ContiguousUCnts->setAlignment(Align(8));
+  ContiguousUCnts->setVisibility(GlobalValue::ProtectedVisibility);
+  CompilerUsedVars.push_back(ContiguousUCnts);
+
+  LLVM_DEBUG(llvm::dbgs() << "Created contiguous arrays: "
+                          << ContiguousCnts->getName() << " ("
+                          << TotalCounterSlots << " slots), "
+                          << ContiguousUCnts->getName() << "\n");
+
+  if (TotalDataEntries > 0) {
+    auto *DataTy = getProfileDataTy();
+    auto *DataArrayTy = ArrayType::get(DataTy, TotalDataEntries);
+    std::string DataSectionName = getInstrProfSectionName(
+        ProfileCorrelate == InstrProfCorrelator::BINARY ? IPSK_covdata
+                                                        : IPSK_data,
+        TT.getObjectFormat());
+    DataSectionName = DataSectionName + "_" + CachedCUID;
+
+    ContiguousData = new GlobalVariable(M, DataArrayTy, /*isConstant=*/false,
+                                        GlobalValue::ExternalLinkage, nullptr,
+                                        "__profd_all_" + CachedCUID);
+    ContiguousData->setSection(DataSectionName);
+    ContiguousData->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
+    ContiguousData->setVisibility(GlobalValue::ProtectedVisibility);
+    CompilerUsedVars.push_back(ContiguousData);
+
+    ContiguousDataInits.assign(TotalDataEntries,
+                               Constant::getNullValue(DataTy));
+  }
+}
+
+// Create __llvm_offload_prf structure for GPU targets.
+// Uses the contiguous arrays allocated by allocateContiguousProfileArrays().
+void InstrLowerer::createProfileSectionSymbols() {
+  LLVM_DEBUG(llvm::dbgs() << "createProfileSectionSymbols() called\n");
+
+  // Only create symbols for device targets (GPU)
+  if (!isGPUProfTarget(M)) {
+    LLVM_DEBUG(llvm::dbgs() << "Not a GPU target, skipping symbol creation\n");
+    return;
+  }
+
+  // No contiguous arrays = no profiling in this TU
+  if (!ContiguousCnts) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "No contiguous counters, skipping symbol creation\n");
+    return;
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Creating profile symbols for CUID=" << CachedCUID
+                          << "\n");
+
+  auto &Ctx = M.getContext();
+  auto *Int8Ty = Type::getInt8Ty(Ctx);
+  auto *Int64Ty = Type::getInt64Ty(Ctx);
+
+  // Get address space from the contiguous counters
+  unsigned AS = ContiguousCnts->getType()->getPointerAddressSpace();
+  auto *Int8PtrTy = PointerType::get(Ctx, AS);
+
+  // Calculate sizes
+  uint64_t CntsSize =
+      M.getDataLayout().getTypeAllocSize(ContiguousCnts->getValueType());
+  uint64_t UCntsSize =
+      M.getDataLayout().getTypeAllocSize(ContiguousUCnts->getValueType());
+
+  // Data section boundaries.
+  GlobalValue *DataStart = nullptr;
+  GlobalValue *DataEndBase = nullptr;
+  uint64_t DataSize = 0;
+  if (ContiguousData) {
+    DataStart = ContiguousData;
+    DataEndBase = ContiguousData;
+    DataSize =
+        M.getDataLayout().getTypeAllocSize(ContiguousData->getValueType());
+  } else {
+    // Legacy per-function data variables: best-effort by scanning.
+    GlobalVariable *FirstData = nullptr;
+    GlobalVariable *LastData = nullptr;
+    for (auto &PD : ProfileDataMap) {
+      if (auto *GV = dyn_cast_or_null<GlobalVariable>(PD.second.DataVar)) {
+        if (!FirstData)
+          FirstData = GV;
+        LastData = GV;
+      }
+    }
+    DataStart = FirstData;
+    DataEndBase = LastData;
+    if (LastData)
+      DataSize = M.getDataLayout().getTypeAllocSize(LastData->getValueType());
+  }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "Section sizes: Cnts=" << CntsSize << " UCnts=" << UCntsSize
+                 << " Data=" << DataSize << " Names=" << NamesSize << "\n";
+  });
+
+  // Helper to get start pointer
+  auto getStartPtr = [&](GlobalValue *GV) -> Constant * {
+    if (!GV)
+      return Constant::getNullValue(Int8PtrTy);
+    return ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
+  };
+
+  // Helper to get end pointer (base + size)
+  auto getEndPtr = [&](GlobalValue *GV, uint64_t Size) -> Constant * {
+    if (!GV)
+      return Constant::getNullValue(Int8PtrTy);
+    auto *BasePtr =
+        ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
+    return ConstantExpr::getGetElementPtr(Int8Ty, BasePtr,
+                                          ConstantInt::get(Int64Ty, Size));
+  };
+
+  // Build the __llvm_offload_prf structure
+  // Order: cnts_start, data_start, names_start, ucnts_start, cnts_end,
+  // data_end, names_end, ucnts_end
+  std::vector<Type *> StructFields(8, Int8PtrTy);
+  std::vector<Constant *> StructValues = {
+      getStartPtr(ContiguousCnts),          // cnts_start
+      getStartPtr(DataStart),               // data_start
+      getStartPtr(NamesVar),                // names_start
+      getStartPtr(ContiguousUCnts),         // ucnts_start
+      getEndPtr(ContiguousCnts, CntsSize),  // cnts_end
+      getEndPtr(DataEndBase, DataSize),     // data_end
+      getEndPtr(NamesVar, NamesSize),       // names_end
+      getEndPtr(ContiguousUCnts, UCntsSize) // ucnts_end
+  };
+
+  auto *UnifiedStructTy = StructType::get(Ctx, StructFields);
+  auto *UnifiedStructInit = ConstantStruct::get(UnifiedStructTy, StructValues);
+
+  // Use CUID-suffixed name to avoid symbol collision in multi-TU programs.
+  // For static modules, the host side registers each TU's shadow variable.
+  // For dynamic modules (hipModuleLoad), the runtime would need to enumerate
+  // symbols matching __llvm_offload_prf_* pattern (future enhancement).
+  std::string OffloadPrfName = "__llvm_offload_prf_" + CachedCUID;
+  auto *UnifiedStruct = new GlobalVariable(
+      M, UnifiedStructTy, /*isConstant=*/true, GlobalValue::ExternalLinkage,
+      UnifiedStructInit, OffloadPrfName);
+  UnifiedStruct->setVisibility(GlobalValue::DefaultVisibility);
+  CompilerUsedVars.push_back(UnifiedStruct);
+
+  LLVM_DEBUG(llvm::dbgs() << "Created " << OffloadPrfName
+                          << " with contiguous arrays\n");
+}
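+
+// For reference, the record built above corresponds to the following
+// host-side view (a sketch; field names are illustrative, only the field
+// order is fixed by the StructValues list above):
+//
+//   struct OffloadPrf {
+//     void *CntsStart, *DataStart, *NamesStart, *UCntsStart;
+//     void *CntsEnd,   *DataEnd,   *NamesEnd,   *UCntsEnd;
+//   };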
+
+// Create HIP device variable registration for profile symbols
+void InstrLowerer::createHIPDeviceVariableRegistration() {
+  LLVM_DEBUG(llvm::dbgs() << "createHIPDeviceVariableRegistration called\n");
+  if (isGPUProfTarget(M)) {
+    LLVM_DEBUG(llvm::dbgs() << "GPU target, skipping registration\n");
+    return;
+  }
+
+  // Get the CUID from the module (same as device side)
+  std::string CUID = getCUIDFromModule(M);
+  if (CUID.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << "No CUID found, skipping registration\n");
+    return;
+  }
+
+  // Find the existing __hip_module_ctor function
+  Function *Ctor = M.getFunction("__hip_module_ctor");
+  if (!Ctor) {
+    LLVM_DEBUG(llvm::dbgs() << "No __hip_module_ctor function found\n");
+    // No HIP compilation context; skip registration.
+    return;
+  }
+
+  // Locate the HIP fat-binary registration call and capture its return value
+  Value *Handle = nullptr;
+  for (BasicBlock &BB : *Ctor) {
+    for (Instruction &I : BB)
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (Function *Callee = CB->getCalledFunction())
+          if (Callee->getName() == "__hipRegisterFatBinary") {
+            Handle = &I; // call result
+            break;
+          }
+    if (Handle)
+      break; // The inner break only exits the instruction loop.
+  }
+  if (!Handle) {
+    LLVM_DEBUG(llvm::dbgs() << "__hipRegisterFatBinary call not found\n");
+    return;
+  }
+  GlobalVariable *FatbinHandleGV = nullptr;
+  if (auto *HandleInst = dyn_cast<Instruction>(Handle))
+    for (Instruction *Cur = HandleInst->getNextNode(); Cur;
+         Cur = Cur->getNextNode()) {
+      auto *SI = dyn_cast<StoreInst>(Cur);
+      if (!SI || SI->getValueOperand() != Handle)
+        continue;
+      if (auto *GV = dyn_cast<GlobalVariable>(
+              SI->getPointerOperand()->stripPointerCasts())) {
+        FatbinHandleGV = GV;
+        break;
+      }
+    }
+
+  if (!FatbinHandleGV) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "store of __hipRegisterFatBinary call not found\n");
+  }
+
+  // Insert the new registration just before the ctor's return.
+  ReturnInst *RetInst = nullptr;
+  for (auto &BB : llvm::reverse(*Ctor))
+    if ((RetInst = dyn_cast<ReturnInst>(BB.getTerminator())))
+      break;
+  if (!RetInst) {
+    LLVM_DEBUG(llvm::dbgs() << "No return instruction found in ctor\n");
+    return;
+  }
+  IRBuilder<> Builder(RetInst);
+
+  LLVM_DEBUG(
+      llvm::dbgs() << "Found __hip_module_ctor, registering anchors for CUID="
+                   << CUID << "\n");
+
+  // Get or create the __hipRegisterVar declaration
+  auto *VoidTy = Type::getVoidTy(M.getContext());
+  auto *VoidPtrTy = PointerType::getUnqual(M.getContext());
+  auto *Int32Ty = Type::getInt32Ty(M.getContext());
+  auto *Int64Ty = Type::getInt64Ty(M.getContext());
+
+  auto *RegisterVarTy =
+      FunctionType::get(VoidTy,
+                        {VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, Int32Ty,
+                         Int64Ty, Int32Ty, Int32Ty},
+                        false);
+  FunctionCallee RegisterVarFunc =
+      M.getOrInsertFunction("__hipRegisterVar", RegisterVarTy);
+
+  Value *HipHandle =
+      FatbinHandleGV ? Builder.CreateLoad(VoidPtrTy, FatbinHandleGV) : Handle;
+
+  // Create the __llvm_offload_prf_<CUID> shadow structure on the host. It is
+  // populated with the device's section boundary addresses at runtime. The
+  // CUID-suffixed name matches the device symbol and avoids multi-TU
+  // collisions.
+  std::string OffloadPrfName = "__llvm_offload_prf_" + CUID;
+  auto *Int8PtrTy = PointerType::get(M.getContext(), 0);
+  std::vector<Type *> StructFields(8, Int8PtrTy);
+  auto *StructTy = StructType::get(M.getContext(), StructFields);
+
+  auto *OffloadPrfShadow = new GlobalVariable(
+      M, StructTy, /*isConstant=*/false, GlobalValue::ExternalLinkage,
+      ConstantAggregateZero::get(StructTy), OffloadPrfName);
+  CompilerUsedVars.push_back(OffloadPrfShadow);
+
+  // Register the unified structure with HIP runtime
+  auto *UnifiedNameStr =
+      ConstantDataArray::getString(M.getContext(), OffloadPrfName, true);
+  auto *UnifiedNameGlobal = new GlobalVariable(
+      M, UnifiedNameStr->getType(), /*isConstant=*/true,
+      GlobalValue::PrivateLinkage, UnifiedNameStr, OffloadPrfName + ".name");
+
+  Builder.CreateCall(RegisterVarFunc,
+                     {HipHandle,
+                      Builder.CreatePointerCast(OffloadPrfShadow, VoidPtrTy),
+                      Builder.CreatePointerCast(UnifiedNameGlobal, VoidPtrTy),
+                      Builder.CreatePointerCast(UnifiedNameGlobal, VoidPtrTy),
+                      Builder.getInt32(0),   // extern = 0
+                      Builder.getInt64(64),  // size = 64 (8 pointers * 8 bytes)
+                      Builder.getInt32(0),   // constant = 0
+                      Builder.getInt32(0)}); // global = 0
+
+  // Register with the profile runtime so it knows to collect data from this TU
+  auto *RegisterShadowVarTy = FunctionType::get(VoidTy, {VoidPtrTy}, false);
+  FunctionCallee RegisterShadowVarFunc = M.getOrInsertFunction(
+      "__llvm_profile_offload_register_shadow_variable", RegisterShadowVarTy);
+  Builder.CreateCall(RegisterShadowVarFunc,
+                     {Builder.CreatePointerCast(OffloadPrfShadow, VoidPtrTy)});
+
+  // Register per-section device symbols so compiler-rt can pre-register them
+  // with CLR before doing hipMemcpy (avoids HSA dependency).
+  FunctionCallee RegisterSectionShadowVarFunc = M.getOrInsertFunction(
+      "__llvm_profile_offload_register_section_shadow_variable",
+      RegisterShadowVarTy);
+
+  auto registerSectionSymbol = [&](StringRef SymName) {
+    // Create a 1-byte shadow global. The type/size are only used as a handle.
+    auto *I8Ty = Type::getInt8Ty(M.getContext());
+    GlobalVariable *Shadow = M.getGlobalVariable(SymName);
+    if (!Shadow) {
+      Shadow = new GlobalVariable(M, I8Ty, /*isConstant=*/false,
+                                  GlobalValue::ExternalLinkage,
+                                  ConstantInt::get(I8Ty, 0), SymName);
+      CompilerUsedVars.push_back(Shadow);
+    }
+
+    auto *NameStr = ConstantDataArray::getString(M.getContext(), SymName, true);
+    auto *NameGlobal = new GlobalVariable(
+        M, NameStr->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage,
+        NameStr, (SymName + ".name").str());
+
+    Builder.CreateCall(RegisterVarFunc,
+                       {HipHandle, Builder.CreatePointerCast(Shadow, VoidPtrTy),
+                        Builder.CreatePointerCast(NameGlobal, VoidPtrTy),
+                        Builder.CreatePointerCast(NameGlobal, VoidPtrTy),
+                        Builder.getInt32(0), // extern = 0
+                        Builder.getInt64(1), // size = 1 byte (handle only)
+                        Builder.getInt32(0), // constant = 0
+                        Builder.getInt32(0)} // global = 0
+    );
+
+    Builder.CreateCall(RegisterSectionShadowVarFunc,
+                       {Builder.CreatePointerCast(Shadow, VoidPtrTy)});
+  };
+
+  // Per-TU contiguous symbols (device side).
+  std::string CntsSym = std::string("__profc_all_") + CUID;
+  std::string DataSym = std::string("__profd_all_") + CUID;
+  std::string UCntsSym = std::string("__profu_all_") + CUID;
+  std::string NamesSym = std::string(getInstrProfNamesVarName()) + "_" + CUID;
+  registerSectionSymbol(CntsSym);
+  registerSectionSymbol(DataSym);
+  registerSectionSymbol(UCntsSym);
+  registerSectionSymbol(NamesSym);
+
+  LLVM_DEBUG(llvm::dbgs() << "Registered " << OffloadPrfName
+                          << " for CUID=" << CUID << "\n");
+}
+
+} // namespace
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 02f06bebb8f0d..a24b8718d0ce9 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -118,6 +118,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <cstdlib>
 #include <memory>
 #include <numeric>
 #include <optional>
@@ -1201,6 +1202,9 @@ class PGOUseFunc {
   // Annotate the irreducible loop header weights.
   void annotateIrrLoopHeaderWeights();
 
+  // Annotate per-block uniformity info for offload profiling.
+  void setBlockUniformityAttribute();
+
   // The hotness of the function from the profile count.
   enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
 
@@ -1307,11 +1311,34 @@ bool PGOUseFunc::setInstrumentedCounts(
 
   setupBBInfoEdges(FuncInfo);
 
-  unsigned NumCounters =
-      InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+  unsigned NumInstrumentedBBs = InstrumentBBs.size();
+  unsigned NumSelects = FuncInfo.SIVisitor.getNumOfSelectInsts();
+  unsigned NumCounters = NumInstrumentedBBs + NumSelects;
   // The number of counters here should match the number of counters
   // in profile. Return if they mismatch.
   if (NumCounters != CountFromProfile.size()) {
+    LLVM_DEBUG({
+      dbgs() << "PGO COUNTER MISMATCH for function " << F.getName() << ":\n";
+      dbgs() << "  Expected counters: " << NumCounters << "\n";
+      dbgs() << "    - From instrumented edges: " << NumInstrumentedBBs << "\n";
+      for (size_t i = 0; i < InstrumentBBs.size(); ++i) {
+        dbgs() << "      " << i << ": " << InstrumentBBs[i]->getName() << "\n";
+      }
+      dbgs() << "    - From select instructions: " << NumSelects << "\n";
+      dbgs() << "  Actual counters from profile: " << CountFromProfile.size()
+             << "\n";
+
+      // Dump the module to a file to help diagnose the mismatch.
+      std::error_code EC;
+      std::string Filename = "pgo_mismatch_" + F.getName().str() + ".ll";
+      raw_fd_ostream OS(Filename, EC);
+      if (!EC) {
+        dbgs() << "Dumping module to " << Filename << "\n";
+        M->print(OS, nullptr);
+      } else {
+        dbgs() << "Error opening file " << Filename << " for writing\n";
+      }
+    });
     return false;
   }
   auto *FuncEntry = &*F.begin();
@@ -1319,6 +1346,7 @@ bool PGOUseFunc::setInstrumentedCounts(
   // Set the profile count to the Instrumented BBs.
   uint32_t I = 0;
   for (BasicBlock *InstrBB : InstrumentBBs) {
+
     uint64_t CountValue = CountFromProfile[I++];
     PGOUseBBInfo &Info = getBBInfo(InstrBB);
     // If we reach here, we know that we have some nonzero count
@@ -1764,6 +1792,47 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() {
   }
 }
 
+void PGOUseFunc::setBlockUniformityAttribute() {
+  if (ProfileRecord.UniformityBits.empty())
+    return;
+
+  // Annotate uniformity on each instrumented IR basic block so later codegen
+  // passes (at the MachineFunction level) can consume it without relying on
+  // fragile block-numbering heuristics.
+  //
+  // Metadata name: "offload-block-uniformity"
+  // Payload: i1 (true = uniform, false = divergent)
+  static constexpr const char *OffloadBlockUniformityMD =
+      "offload-block-uniformity";
+
+  std::vector<BasicBlock *> InstrumentBBs;
+  FuncInfo.getInstrumentBBs(InstrumentBBs);
+
+  LLVMContext &Ctx = F.getContext();
+  Type *Int1Ty = Type::getInt1Ty(Ctx);
+
+  for (size_t I = 0, E = InstrumentBBs.size(); I < E; ++I) {
+    BasicBlock *BB = InstrumentBBs[I];
+    if (!BB || !BB->getTerminator())
+      continue;
+    bool IsUniform = ProfileRecord.isBlockUniform(I);
+    auto *MD = MDNode::get(
+        Ctx, ConstantAsMetadata::get(ConstantInt::get(Int1Ty, IsUniform)));
+    BB->getTerminator()->setMetadata(OffloadBlockUniformityMD, MD);
+  }
+
+  // Keep a function attribute for debugging / IR inspection.
+  // Format: "U" for uniform, "D" for divergent, one per instrumented block.
+  std::string UniformityStr;
+  UniformityStr.reserve(InstrumentBBs.size());
+  for (size_t I = 0, E = InstrumentBBs.size(); I < E; ++I)
+    UniformityStr += ProfileRecord.isBlockUniform(I) ? 'U' : 'D';
+  F.addFnAttr("offload-block-uniformity", UniformityStr);
+
+  LLVM_DEBUG(dbgs() << "PGO: Set offload block uniformity for " << F.getName()
+                    << ": " << UniformityStr << "\n");
+}
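+
+// Resulting IR (sketch) for an instrumented block and its function:
+//
+//   br i1 %cmp, label %then, label %else, !offload-block-uniformity !0
+//   !0 = !{i1 true}
+//   attributes #0 = { ... "offload-block-uniformity"="UUD" }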
+
 void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   Module *M = F.getParent();
   IRBuilder<> Builder(&SI);
@@ -2275,6 +2344,7 @@ static bool annotateAllFunctions(
     Func.setBranchWeights();
     Func.annotateValueSites();
     Func.annotateIrrLoopHeaderWeights();
+    Func.setBlockUniformityAttribute();
     PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
     if (FreqAttr == PGOUseFunc::FFA_Cold)
       ColdFunctions.push_back(&F);
@@ -2409,14 +2479,14 @@ void llvm::setProfMetadata(Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
                            uint64_t MaxCount) {
   auto Weights = downscaleWeights(EdgeCounts, MaxCount);
 
-  LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
-                                           : Weights) {
+  LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W : Weights) {
     dbgs() << W << " ";
-  } dbgs() << "\n";);
+  } dbgs() << "\n");
 
   misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
 
   setBranchWeights(*TI, Weights, /*IsExpected=*/false);
+
   if (EmitBranchProbability) {
     std::string BrCondStr = getBranchCondString(TI);
     if (BrCondStr.empty())
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
new file mode 100644
index 0000000000000..95ce5c30f7f53
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
@@ -0,0 +1,41 @@
+;; Test that AMDGPU targets use contiguous counter allocation with CUID-based naming.
+;; This avoids linker reordering issues where individual __profc_* symbols could be
+;; placed in any order within the section.
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+;; Simulate a module with CUID (as generated by HIP compilation)
+@__hip_cuid_abc123 = addrspace(1) global i8 0
+
+@__profn_kernel1 = private constant [7 x i8] c"kernel1"
+@__profn_kernel2 = private constant [7 x i8] c"kernel2"
+
+;; Check that contiguous counter array is created with CUID suffix
+; CHECK: @__profc_all_abc123 = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts_abc123", align 8
+
+;; Check that contiguous uniform counter array is created for divergence tracking
+; CHECK: @__profu_all_abc123 = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_ucnts_abc123", align 8
+
+;; Check that individual __profc_kernel* symbols are NOT created (contiguous mode)
+; CHECK-NOT: @__profc_kernel1
+; CHECK-NOT: @__profc_kernel2
+
+define amdgpu_kernel void @kernel1() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 0)
+  call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 1)
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() {
+  call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; Check that __llvm_offload_prf_<CUID> structure is created with 8 pointers
+; CHECK: @__llvm_offload_prf_abc123 = addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) }
+
+;; Per-function data symbols are aliases into the contiguous __profd_all array
+; CHECK: @__profd_kernel1 = protected alias
+; CHECK: @__profd_kernel2 = protected alias
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll
new file mode 100644
index 0000000000000..d326ef67b613a
--- /dev/null
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-uniform-counters.ll
@@ -0,0 +1,31 @@
+;; Test that AMDGPU targets generate uniform counter instrumentation for
+;; divergence tracking. This enables PGO to detect which blocks execute
+;; uniformly (all lanes active) vs divergently (partial wave execution).
+
+; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s
+
+@__hip_cuid_test123 = addrspace(1) global i8 0
+@__profn_test_kernel = private constant [11 x i8] c"test_kernel"
+
+define amdgpu_kernel void @test_kernel() {
+  call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 12345, i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
+
+;; Check that uniform counter array is created
+; CHECK: @__profu_all_test123 = protected addrspace(1) global
+
+;; Check that ballot intrinsic is used to get active mask
+; CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
+
+;; Check that ctpop is used to count active lanes
+; CHECK: call i32 @llvm.ctpop.i32
+
+;; Check that uniformity check compares active mask to full wave mask (0xFFFFFFFF)
+; CHECK: icmp eq i32 %{{.*}}, -1
+
+;; Check that uniform counter is conditionally updated based on uniformity
+;; The atomic wave leader mode uses a branch on isUniform
+; CHECK: br i1 %isUniform, label %uniform_then
diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
index 08cbcaa962b76..75fd18a94940d 100644
--- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
@@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu"
 
 @__profn_foo = private constant [3 x i8] c"foo"
 ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
-; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
+; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
+; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
 @__profn_bar = private constant [3 x i8] c"bar"
 ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
-; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
+; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
+; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
 
 ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names"
 ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames"
diff --git a/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll b/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
index 456103164378e..b8c2bc2f04f9f 100644
--- a/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/inline-data-var-create.ll
@@ -7,17 +7,18 @@
 
 target triple = "x86_64-unknown-linux-gnu"
 
-; INLINEFIRST: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; INLINEFIRST: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
-; INLINEFIRST: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-
-; INLINEAFTER: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-; INLINEAFTER: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; INLINEAFTER: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
-
-; NOINLINE: @__profd_foobar = private global{{.*}}zeroinitializer, i32 99
-; NOINLINE: @__profd_foo = private global{{.*}}zeroinitializer, i32 21
-; NOINLINE: @__profd_bar = private global{{.*}}zeroinitializer, i32 23
+;; Note: the struct layout is { ..., i32, [3 x i16], i16, i32 }, where the i16 is NumOffloadProfilingThreads
+; INLINEFIRST: @__profd_foo = private global{{.*}}i16 0, i32 21
+; INLINEFIRST: @__profd_bar = private global{{.*}}i16 0, i32 23
+; INLINEFIRST: @__profd_foobar = private global{{.*}}i16 0, i32 99
+
+; INLINEAFTER: @__profd_foobar = private global{{.*}}i16 0, i32 99
+; INLINEAFTER: @__profd_foo = private global{{.*}}i16 0, i32 21
+; INLINEAFTER: @__profd_bar = private global{{.*}}i16 0, i32 23
+
+; NOINLINE: @__profd_foobar = private global{{.*}}i16 0, i32 99
+; NOINLINE: @__profd_foo = private global{{.*}}i16 0, i32 21
+; NOINLINE: @__profd_bar = private global{{.*}}i16 0, i32 23
 
 declare void @llvm.instrprof.increment(ptr %0, i64 %1, i32 %2, i32 %3)
 declare void @llvm.instrprof.mcdc.parameters(ptr %0, i64 %1, i32 %2)
diff --git a/llvm/test/Instrumentation/InstrProfiling/platform.ll b/llvm/test/Instrumentation/InstrProfiling/platform.ll
index 9c76a5caf2a51..eaccf6681a426 100644
--- a/llvm/test/Instrumentation/InstrProfiling/platform.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/platform.ll
@@ -9,26 +9,38 @@
 ; RUN: opt < %s -mtriple=x86_64-pc-windows -passes=instrprof -S | FileCheck %s -check-prefix=WINDOWS
 ; RUN: opt < %s -mtriple=powerpc64-ibm-aix-xcoff -passes=instrprof -S | FileCheck %s -check-prefix=AIX
 ; RUN: opt < %s -mtriple=arm-elf -passes=instrprof -S | FileCheck %s -check-prefix=BAREMETAL
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s -check-prefix=AMDGPU
 
 @__profn_foo = private constant [3 x i8] c"foo"
 ; MACHO-NOT: __profn_foo
 ; ELF-NOT: __profn_foo
 ; WINDOWS-NOT: __profn_foo
 ; AIX-NOT: __profn_foo
+; AMDGPU-NOT: __profn_foo
 
 ; MACHO: @__profc_foo = private global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
 ; ELF: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
 ; WINDOWS: @__profc_foo = private global [1 x i64] zeroinitializer, section ".lprfc$M", align 8
 ; AIX: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", align 8
+;; AMDGPU uses contiguous counter allocation with hash-based naming when no CUID is present
+; AMDGPU: @__profc_all_{{[0-9]+}} = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts_{{[0-9]+}}", align 8
+; AMDGPU: @__profu_all_{{[0-9]+}} = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_ucnts_{{[0-9]+}}", align 8
 
 ; MACHO: @__profd_foo = private {{.*}}, section "__DATA,__llvm_prf_data,regular,live_support", align 8
 ; ELF: @__profd_foo = private {{.*}}, section "__llvm_prf_data", comdat($__profc_foo), align 8
 ; WINDOWS: @__profd_foo = private global {{.*}}, section ".lprfd$M", align 8
 ; AIX: @__profd_foo = private {{.*}}, section "__llvm_prf_data", align 8
-
 ; ELF: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names"{{.*}}, align 1
 ; WINDOWS: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}lprfn$M", align 1
 ; AIX: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names", align 1
+;; AMDGPU uses a CUID-suffixed names symbol in the names section
+; AMDGPU: @__llvm_prf_nm_{{[0-9]+}} = protected addrspace(1) constant [{{.*}} x i8] c"{{.*}}", section "__llvm_prf_names", align 1
+
+;; Check for __llvm_offload_prf_<CUID> structure (replaces start/stop symbols for AMDGPU)
+; AMDGPU: @__llvm_offload_prf_{{[0-9]+}} = addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) }
+
+;; AMDGPU uses per-TU contiguous allocation, so @__profd_foo is an alias
+; AMDGPU: @__profd_foo = protected alias
 
 define void @foo() {
   call void @llvm.instrprof.increment(ptr @__profn_foo, i64 0, i32 1, i32 0)
@@ -37,6 +49,10 @@ define void @foo() {
 
 declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 
+;; Start/stop symbols are NOT created for AMDGPU with contiguous allocation
+; AMDGPU-NOT: @__start___llvm_prf_cnts
+; AMDGPU-NOT: @__stop___llvm_prf_cnts
+
 ;; Emit registration functions for platforms that don't find the
 ;; symbols by their sections.
 
@@ -48,6 +64,7 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 ; WINDOWS-NOT: define internal void @__llvm_profile_register_functions
 ; AIX-NOT: define internal void @__llvm_profile_register_functions
 ; BAREMETAL-NOT: define internal void @__llvm_profile_register_functions
+; AMDGPU-NOT: define internal void @__llvm_profile_register_functions
 
 ;; PR38340: When dynamic registration is used, we had a bug where we'd register
 ;; something that's not a __profd_* variable.
@@ -60,3 +77,4 @@ declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 ; WINDOWS-NOT: define internal void @__llvm_profile_init
 ; AIX-NOT: define internal void @__llvm_profile_init
 ; BAREMETAL-NOT: define internal void @__llvm_profile_init
+; AMDGPU-NOT: define internal void @__llvm_profile_init
diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
index 1bad0db1b4762..a3d7def15c516 100644
--- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
@@ -13,9 +13,9 @@ $foo = comdat any
 ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
 ; CHECK-NOT: __profn__stdin__foo
 ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
+; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
 ; CHECK-NOT: @foo
-; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
+; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
 ; CHECK: @__llvm_prf_nm
 ; CHECK: @llvm.compiler.used
 
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
index 56d8364d8f543..7a333362ce9ba 100644
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
+++ b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
@@ -14,7 +14,7 @@ $__llvm_profile_raw_version = comdat any
 
 ; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
 ; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
+; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i16, i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i16 0, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
 ; SAMPLE-VAR: @__llvm_prf_nm = private constant {{.*}}, section "__llvm_prf_names", align 1
 ; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
 ; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
diff --git a/llvm/test/tools/llvm-profdata/profile-version.test b/llvm/test/tools/llvm-profdata/profile-version.test
index e811699ac63ed..7f627cb89c7d0 100644
--- a/llvm/test/tools/llvm-profdata/profile-version.test
+++ b/llvm/test/tools/llvm-profdata/profile-version.test
@@ -2,7 +2,7 @@ Test the profile version.
 
 RUN: llvm-profdata merge -o %t.profdata %p/Inputs/basic.proftext
 RUN: llvm-profdata show --profile-version %t.profdata | FileCheck %s
-CHECK: Profile version: 13
+CHECK: Profile version: 14
 
 RUN: llvm-profdata merge -o %t.prev.profdata %p/Inputs/basic.proftext --write-prev-version
 RUN: llvm-profdata show --profile-version %t.prev.profdata | FileCheck %s --check-prefix=PREV
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 74c4732ca129a..774fb46c5b1cd 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
@@ -354,6 +355,11 @@ static cl::opt<unsigned> MemprofGenerateRandomHotnessSeed(
     cl::sub(MergeSubcommand),
     cl::desc("Random hotness seed to use (0 to generate new seed)"));
 
+static cl::opt<unsigned> OffloadDeviceWaveSize(
+    "wave-size", cl::init(32), cl::sub(MergeSubcommand),
+    cl::desc("Wave size for AMDGPU offload profiling uniformity detection "
+             "(32 for gfx10/gfx11, 64 for gfx9). Default: 32"));
+
 // Options specific to overlap subcommand.
 static cl::opt<std::string> BaseFilename(cl::Positional, cl::Required,
                                          cl::desc("<base profile file>"),
@@ -655,11 +661,15 @@ struct WriterContext {
 
   WriterContext(bool IsSparse, std::mutex &ErrLock,
                 SmallSet<instrprof_error, 4> &WriterErrorCodes,
-                uint64_t ReservoirSize = 0, uint64_t MaxTraceLength = 0)
+                uint64_t ReservoirSize = 0, uint64_t MaxTraceLength = 0,
+                unsigned WaveSize = 0)
       : Writer(IsSparse, ReservoirSize, MaxTraceLength, DoWritePrevVersion,
                MemProfVersionRequested, MemProfFullSchema,
                MemprofGenerateRandomHotness, MemprofGenerateRandomHotnessSeed),
-        ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
+        ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {
+    if (WaveSize > 0)
+      Writer.setOffloadWaveSize(WaveSize);
+  }
 };
 
 /// Computer the overlap b/w profile BaseFilename and TestFileName,
@@ -690,6 +700,62 @@ static void overlapInput(const std::string &BaseFilename,
   }
 }
 
+/// Read uniform counters from a .unifcnts file.
+/// Returns true if the file was successfully read, false otherwise.
+/// On success, the counters are stored in the UniformCounters vector.
+static bool readUniformCountersFile(StringRef ProfileFilename,
+                                    std::vector<uint64_t> &UniformCounters) {
+  // Construct the .unifcnts filename by replacing the extension
+  SmallString<256> UniformFilename(ProfileFilename);
+  sys::path::replace_extension(UniformFilename, "unifcnts");
+
+  // Try to open the file
+  auto BufferOrErr = MemoryBuffer::getFile(UniformFilename);
+  if (!BufferOrErr) {
+    // File doesn't exist or can't be read - this is not an error,
+    // just means no uniform counters are available
+    return false;
+  }
+
+  auto &Buffer = *BufferOrErr.get();
+  const char *Data = Buffer.getBufferStart();
+  size_t Size = Buffer.getBufferSize();
+
+  // Minimum size: 4 uint64_t header fields
+  if (Size < 4 * sizeof(uint64_t))
+    return false;
+
+  // Read header
+  uint64_t Magic = support::endian::read64le(Data);
+  uint64_t Version = support::endian::read64le(Data + 8);
+  uint64_t NumCounters = support::endian::read64le(Data + 16);
+  uint64_t CountersSize = support::endian::read64le(Data + 24);
+
+  // Verify magic number
+  const uint64_t ExpectedMagic = 0x55434E5450524F46ULL; // "UCNTPROF"
+  if (Magic != ExpectedMagic)
+    return false;
+
+  // Verify version
+  if (Version != 1)
+    return false;
+
+  // Verify header consistency and size (guards the counter reads below)
+  if (CountersSize != NumCounters * sizeof(uint64_t))
+    return false;
+  size_t ExpectedSize = 4 * sizeof(uint64_t) + CountersSize;
+  if (Size < ExpectedSize)
+    return false;
+
+  // Read counters
+  UniformCounters.resize(NumCounters);
+  const char *CounterData = Data + 4 * sizeof(uint64_t);
+  for (uint64_t I = 0; I < NumCounters; ++I) {
+    UniformCounters[I] =
+        support::endian::read64le(CounterData + I * sizeof(uint64_t));
+  }
+
+  return true;
+}
+
 /// Load an input into a writer context.
 static void
 loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
@@ -820,11 +886,50 @@ loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
     return;
   }
 
+  // Try to read uniform counters file for AMDGPU divergence tracking
+  std::vector<uint64_t> UniformCounters;
+  bool HasUniformCounters =
+      readUniformCountersFile(Input.Filename, UniformCounters);
+  size_t UniformCounterOffset = 0;
+
   for (auto &I : *Reader) {
     if (Remapper)
       I.Name = (*Remapper)(I.Name);
     const StringRef FuncName = I.Name;
     bool Reported = false;
+
+    // If we have uniform counters and this is an offload profile, compute
+    // uniformity from the uniform/total counter ratio
+    if (HasUniformCounters && I.NumOffloadProfilingThreads > 0) {
+      size_t NumCounters = I.Counts.size();
+      if (UniformCounterOffset + NumCounters <= UniformCounters.size()) {
+        // Compute uniformity bits from uniform counter ratio
+        size_t NumBlocks = NumCounters / (I.NumOffloadProfilingThreads + 1);
+        I.UniformityBits.resize((NumBlocks + 7) / 8, 0xFF); // Default: uniform
+
+        for (size_t BlockIdx = 0; BlockIdx < NumBlocks; ++BlockIdx) {
+          uint64_t TotalCount = 0;
+          uint64_t UniformCount = 0;
+
+          // Sum across all slots for this block
+          for (size_t Slot = 0; Slot < I.NumOffloadProfilingThreads; ++Slot) {
+            size_t Idx = BlockIdx * (I.NumOffloadProfilingThreads + 1) + Slot;
+            TotalCount += I.Counts[Idx];
+            UniformCount += UniformCounters[UniformCounterOffset + Idx];
+          }
+
+          // Compute uniformity ratio (90% threshold)
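+          // e.g. TotalCount=100, UniformCount=95 -> 0.95 >= 0.9, so the
+          // block stays uniform; UniformCount=80 -> 0.80 < 0.9, divergent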
+          bool IsUniform =
+              (TotalCount == 0) || ((double)UniformCount / TotalCount >= 0.9);
+          if (!IsUniform) {
+            I.UniformityBits[BlockIdx / 8] &= ~(1 << (BlockIdx % 8));
+          }
+        }
+
+        UniformCounterOffset += NumCounters;
+      }
+    }
+
     WC->Writer.addRecord(std::move(I), Input.Weight, [&](Error E) {
       if (Reported) {
         consumeError(std::move(E));
@@ -1043,7 +1148,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
   for (unsigned I = 0; I < NumThreads; ++I)
     Contexts.emplace_back(std::make_unique<WriterContext>(
         OutputSparse, ErrorLock, WriterErrorCodes, TraceReservoirSize,
-        MaxTraceLength));
+        MaxTraceLength, OffloadDeviceWaveSize));
 
   if (NumThreads == 1) {
     for (const auto &Input : Inputs)
@@ -1477,8 +1582,8 @@ static void supplementInstrProfile(const WeightedFileVector &Inputs,
   // Read instr profile.
   std::mutex ErrorLock;
   SmallSet<instrprof_error, 4> WriterErrorCodes;
-  auto WC = std::make_unique<WriterContext>(OutputSparse, ErrorLock,
-                                            WriterErrorCodes);
+  auto WC = std::make_unique<WriterContext>(
+      OutputSparse, ErrorLock, WriterErrorCodes, 0, 0, OffloadDeviceWaveSize);
   loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get());
   if (WC->Errors.size() > 0)
     exitWithError(std::move(WC->Errors[0].first), InstrFilename);
@@ -2979,6 +3084,16 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
           OS << (I == Start ? "" : ", ") << Func.Counts[I];
         }
         OS << "]\n";
+
+        // Show uniformity bits if present
+        if (!Func.UniformityBits.empty()) {
+          OS << "    Block uniformity: [";
+          for (size_t I = Start, E = Func.Counts.size(); I < E; ++I) {
+            bool IsUniform = Func.isBlockUniform(I);
+            OS << (I == Start ? "" : ", ") << (IsUniform ? "U" : "D");
+          }
+          OS << "]\n";
+        }
       }
 
       if (ShowIndirectCallTargets) {

>From 9f0779ffa7e2bf1b51793004085060305b71a2fe Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Fri, 30 Jan 2026 16:34:00 -0500
Subject: [PATCH 2/5] [PGO] Unify __llvm_write_custom_profile API for HIP and
 OpenMP

Extend __llvm_write_custom_profile to support both HIP and OpenMP
offload PGO with a unified signature:

  int __llvm_write_custom_profile(
      const char *Target,
      const char *TUSuffix,              // NULL for single-TU
      const __llvm_profile_data *DataBegin,
      const __llvm_profile_data *DataEnd,
      const char *CountersBegin,
      const char *CountersEnd,
      const char *UniformCountersBegin,  // NULL if not used
      const char *UniformCountersEnd,    // NULL if not used
      const char *NamesBegin,
      const char *NamesEnd,
      const uint64_t *VersionOverride);

Changes:
- compiler-rt: Extend __llvm_write_custom_profile with TUSuffix and
  UniformCounters parameters
- OpenMP: Update GlobalHandler to use new signature (NULL for new params)
- HIP: Update InstrProfilingPlatformROCm.c to call unified function
- InstrProfiling: Gate contiguous allocation on HIP (check for
  __hip_cuid_* variable) to maintain OpenMP compatibility

File naming uses suffix format: <basename>.<target>.<TUSuffix>.<ext>

E2E tested with both OpenMP and HIP offload PGO.
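
As a sketch, a host-side caller with the new signature (pointer names
illustrative) might look like:

  __llvm_write_custom_profile("amdgcn-amd-amdhsa", /*TUSuffix=*/"0",
                              DataBegin, DataEnd,
                              CountersBegin, CountersEnd,
                              /*UniformCountersBegin=*/NULL,
                              /*UniformCountersEnd=*/NULL,
                              NamesBegin, NamesEnd,
                              /*VersionOverride=*/NULL);

With a base filename of default.profraw, this writes
default.amdgcn-amd-amdhsa.0.profraw.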
---
 compiler-rt/lib/profile/InstrProfiling.h      |  22 +-
 compiler-rt/lib/profile/InstrProfilingFile.c  | 102 +++++++--
 .../lib/profile/InstrProfilingPlatformROCm.c  | 212 +++++-------------
 .../Instrumentation/InstrProfiling.cpp        |  17 +-
 .../amdgpu-contiguous-counters.ll             |   5 +-
 .../InstrProfiling/platform.ll                |  22 +-
 .../common/include/GlobalHandler.h            |   8 +-
 .../common/src/GlobalHandler.cpp              |   6 +-
 8 files changed, 187 insertions(+), 207 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 6ebeb4c2dc6c7..453b57241a4e0 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -308,14 +308,28 @@ void __llvm_profile_set_dumped(void);
 
 /*!
  * \brief Write custom target-specific profiling data to a separate file.
- * Used by offload PGO.
+ * Used by offload PGO (HIP and OpenMP).
+ *
+ * \param Target Target triple (e.g., "amdgcn-amd-amdhsa")
+ * \param TUSuffix TU index suffix (e.g., "0", "1") or NULL for no suffix
+ * \param DataBegin Start of profile data records
+ * \param DataEnd End of profile data records
+ * \param CountersBegin Start of counter data
+ * \param CountersEnd End of counter data
+ * \param UniformCountersBegin Start of uniform counters (NULL if not used)
+ * \param UniformCountersEnd End of uniform counters (NULL if not used)
+ * \param NamesBegin Start of names data
+ * \param NamesEnd End of names data
+ * \param VersionOverride Profile version override (NULL to use default)
  */
-int __llvm_write_custom_profile(const char *Target,
+int __llvm_write_custom_profile(const char *Target, const char *TUSuffix,
                                 const __llvm_profile_data *DataBegin,
                                 const __llvm_profile_data *DataEnd,
                                 const char *CountersBegin,
-                                const char *CountersEnd, const char *NamesBegin,
-                                const char *NamesEnd,
+                                const char *CountersEnd,
+                                const char *UniformCountersBegin,
+                                const char *UniformCountersEnd,
+                                const char *NamesBegin, const char *NamesEnd,
                                 const uint64_t *VersionOverride);
 
 /*!
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index aa9d567a1d17f..87074b0c369ee 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1284,14 +1284,16 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
 }
 
 #ifndef __APPLE__
-int __llvm_write_custom_profile(const char *Target,
-                                const __llvm_profile_data *DataBegin,
-                                const __llvm_profile_data *DataEnd,
-                                const char *CountersBegin,
-                                const char *CountersEnd, const char *NamesBegin,
-                                const char *NamesEnd,
-                                const uint64_t *VersionOverride) {
-  int ReturnValue = 0, FilenameLength, TargetLength;
+int __llvm_write_custom_profile(const char *Target, const char *TUSuffix,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd,
+                                const char *UniformCountersBegin,
+                                const char *UniformCountersEnd,
+                                const char *NamesBegin, const char *NamesEnd,
+                                const uint64_t *VersionOverride) {
+  int ReturnValue = 0, FilenameLength, TargetLength, TUSuffixLength;
   char *FilenameBuf, *TargetFilename;
   const char *Filename;
 
@@ -1309,7 +1311,8 @@ int __llvm_write_custom_profile(const char *Target,
   }
 
   /* Check if there is llvm/runtime version mismatch.  */
-  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
+  if (VersionOverride == NULL &&
+      GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
     PROF_ERR("Runtime and instrumentation version mismatch : "
              "expected %d, but get %d\n",
              INSTR_PROF_RAW_VERSION,
@@ -1333,9 +1336,12 @@ int __llvm_write_custom_profile(const char *Target,
   }
 
   /* Allocate new space for our target-specific PGO filename */
+  /* Format: <dir>/<basename_without_ext>.<target>.<TUSuffix>.<ext> */
+  /* This matches the HIP convention for backward compatibility */
   TargetLength = strlen(Target);
-  TargetFilename =
-      (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
+  TUSuffixLength = TUSuffix ? strlen(TUSuffix) : 0;
+  TargetFilename = (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength +
+                                              TUSuffixLength + 3);
 
   /* Find file basename and path sizes */
   int32_t DirEnd = FilenameLength - 1;
@@ -1344,15 +1350,34 @@ int __llvm_write_custom_profile(const char *Target,
   }
   uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize;
 
-  /* Prepend "TARGET." to current filename */
+  /* Find extension within basename */
+  const char *Basename = Filename + DirSize;
+  const char *Extension = strrchr(Basename, '.');
+  uint32_t BasenameNoExtSize =
+      Extension ? (uint32_t)(Extension - Basename) : BaseSize;
+  uint32_t ExtSize = Extension ? (uint32_t)(BaseSize - BasenameNoExtSize) : 0;
+
+  /* Build filename: <dir>/<basename_without_ext>.<target>.<TUSuffix>.<ext> */
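+  /* e.g. "default.profraw" with Target "amdgcn-amd-amdhsa" and TUSuffix "0"
+   * becomes "default.amdgcn-amd-amdhsa.0.profraw". */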
+  char *p = TargetFilename;
   if (DirSize > 0) {
-    memcpy(TargetFilename, Filename, DirSize);
+    memcpy(p, Filename, DirSize);
+    p += DirSize;
+  }
+  memcpy(p, Basename, BasenameNoExtSize);
+  p += BasenameNoExtSize;
+  *p++ = '.';
+  memcpy(p, Target, TargetLength);
+  p += TargetLength;
+  if (TUSuffixLength > 0) {
+    *p++ = '.';
+    memcpy(p, TUSuffix, TUSuffixLength);
+    p += TUSuffixLength;
+  }
+  if (ExtSize > 0) {
+    memcpy(p, Extension, ExtSize);
+    p += ExtSize;
   }
-  memcpy(TargetFilename + DirSize, Target, TargetLength);
-  TargetFilename[TargetLength + DirSize] = '.';
-  memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
-         BaseSize);
-  TargetFilename[FilenameLength + 1 + TargetLength] = 0;
+  *p = '\0';
 
   /* Open and truncate target-specific PGO file */
   FILE *OutputFile = fopen(TargetFilename, "w");
@@ -1383,6 +1408,47 @@ int __llvm_write_custom_profile(const char *Target,
                          NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version);
   closeFileObject(OutputFile);
 
+  /* Write uniform counters to a separate file if provided */
+  if (ReturnValue == 0 && UniformCountersBegin && UniformCountersEnd &&
+      UniformCountersEnd > UniformCountersBegin) {
+    size_t UniformCountersSize = UniformCountersEnd - UniformCountersBegin;
+
+    /* Create uniform counters filename by replacing extension with .unifcnts */
+    size_t TargetFilenameLen = strlen(TargetFilename);
+    char *UniformFilename = (char *)COMPILER_RT_ALLOCA(TargetFilenameLen + 10);
+    strcpy(UniformFilename, TargetFilename);
+
+    /* Find and replace extension */
+    char *ext = strrchr(UniformFilename, '.');
+    if (ext) {
+      strcpy(ext, ".unifcnts");
+    } else {
+      strcat(UniformFilename, ".unifcnts");
+    }
+
+    FILE *UniformFile = fopen(UniformFilename, "wb");
+    if (UniformFile) {
+      /* Write a simple header: magic, version, num_counters, counters_size */
+      uint64_t UniformMagic = 0x55434E5450524F46ULL; /* "UCNTPROF" in ASCII */
+      uint64_t UniformVersion = 1;
+      uint64_t NumUniformCounters = UniformCountersSize / sizeof(uint64_t);
+
+      if (fwrite(&UniformMagic, sizeof(uint64_t), 1, UniformFile) != 1 ||
+          fwrite(&UniformVersion, sizeof(uint64_t), 1, UniformFile) != 1 ||
+          fwrite(&NumUniformCounters, sizeof(uint64_t), 1, UniformFile) != 1 ||
+          fwrite(&UniformCountersSize, sizeof(uint64_t), 1, UniformFile) != 1 ||
+          fwrite(UniformCountersBegin, 1, UniformCountersSize, UniformFile) !=
+              UniformCountersSize) {
+        PROF_WARN("Failed to write uniform counters to %s\n", UniformFilename);
+        ReturnValue = -1;
+      }
+      fclose(UniformFile);
+    } else {
+      PROF_WARN("Failed to open %s for writing uniform counters\n",
+                UniformFilename);
+    }
+  }
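+  /* The layout written above matches what llvm-profdata's
+   * readUniformCountersFile() expects: four u64 header words (read as
+   * little-endian), followed by the raw counter words. */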
+
   // Restore SIGKILL.
   if (PDeathSig == 1)
     lprofRestoreSigKill();
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
index 18c55036bbc24..d1c13089227b0 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -277,8 +277,6 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
     (void)hipGetSymbolAddress(&DevPtr, OffloadSectionShadowVariables[i]);
   }
 
-  char *DeviceFilename = NULL;
-  FILE *File = NULL;
   int ret = -1;
 
   // Allocate host memory for the device sections
@@ -317,61 +315,10 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
               UniformCountersSize);
   }
 
-  // Construct the device-specific filename
-  // Format: <base>.<target>[.<TUIndex>].<ext>
-  // TUIndex is included when >= 0 to support multi-TU programs
-  char *BaseFilename = (char *)__llvm_profile_get_filename();
-  if (!BaseFilename) {
-    PROF_ERR("%s\n", "Failed to get base profile filename");
-    goto cleanup;
-  }
-  if (IsVerboseMode())
-    PROF_NOTE("Base profile filename: %s\n", BaseFilename);
-
-  const char *TargetInfix = "amdgcn-amd-amdhsa";
-  const char *Extension = strrchr(BaseFilename, '.');
-  char TUIndexStr[16] = "";
-  if (TUIndex >= 0) {
-    snprintf(TUIndexStr, sizeof(TUIndexStr), ".%d", TUIndex);
-  }
-
-  if (Extension) {
-    size_t BaseLen = Extension - BaseFilename;
-    size_t InfixLen = strlen(TargetInfix);
-    size_t TUIndexLen = strlen(TUIndexStr);
-    size_t ExtLen = strlen(Extension);
-    DeviceFilename =
-        (char *)malloc(BaseLen + 1 + InfixLen + TUIndexLen + ExtLen + 1);
-    strncpy(DeviceFilename, BaseFilename, BaseLen);
-    DeviceFilename[BaseLen] = '\0';
-    strcat(DeviceFilename, ".");
-    strcat(DeviceFilename, TargetInfix);
-    strcat(DeviceFilename, TUIndexStr);
-    strcat(DeviceFilename, Extension);
-  } else {
-    DeviceFilename =
-        (char *)malloc(strlen(BaseFilename) + 1 + strlen(TargetInfix) +
-                       strlen(TUIndexStr) + 1);
-    strcpy(DeviceFilename, BaseFilename);
-    strcat(DeviceFilename, ".");
-    strcat(DeviceFilename, TargetInfix);
-    strcat(DeviceFilename, TUIndexStr);
-  }
-  free(BaseFilename);
-
-  if (IsVerboseMode())
-    PROF_NOTE("Device profile filename: %s\n", DeviceFilename);
-
-  // Manually write the profile data with a proper header
-  File = fopen(DeviceFilename, "w");
-  if (!File) {
-    PROF_ERR("Failed to open %s for writing\n", DeviceFilename);
-    goto cleanup;
-  }
-
-  __llvm_profile_header Header;
+  // Compute padding sizes for the buffer layout. lprofWriteDataImpl computes
+  // CountersDelta = CountersBegin - DataBegin, so the buffer must be arranged
+  // to match the expected file layout.
   const uint64_t NumData = DataSize / sizeof(__llvm_profile_data);
-  const uint64_t NumCounters = CountersSize / sizeof(uint64_t);
   const uint64_t NumBitmapBytes = 0;
   const uint64_t VTableSectionSize = 0;
   const uint64_t VNamesSize = 0;
@@ -388,8 +335,31 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
     goto cleanup;
   }
 
-  // Relocate pointers
-  __llvm_profile_data *RelocatedData = (__llvm_profile_data *)HostDataBegin;
+  // Create contiguous buffer with layout: [Data][Padding][Counters][Names]
+  // This ensures CountersBegin - DataBegin ==
+  // DataSize + PaddingBytesBeforeCounters
+  size_t ContiguousBufferSize =
+      DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
+  char *ContiguousBuffer = (char *)malloc(ContiguousBufferSize);
+  if (!ContiguousBuffer) {
+    PROF_ERR("%s\n", "Failed to allocate contiguous buffer");
+    goto cleanup;
+  }
+  memset(ContiguousBuffer, 0, ContiguousBufferSize);
+
+  // Set up pointers into the contiguous buffer
+  char *BufDataBegin = ContiguousBuffer;
+  char *BufCountersBegin =
+      ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
+  char *BufNamesBegin = BufCountersBegin + CountersSize;
+
+  // Copy data into contiguous buffer
+  memcpy(BufDataBegin, HostDataBegin, DataSize);
+  memcpy(BufCountersBegin, HostCountersBegin, CountersSize);
+  memcpy(BufNamesBegin, HostNamesBegin, NamesSize);
+
+  // Relocate CounterPtr in data records for file layout
+  // CounterPtr is device-relative offset; we need to adjust for file layout
+  // where Data section comes first, then Counters section
+  __llvm_profile_data *RelocatedData = (__llvm_profile_data *)BufDataBegin;
   for (uint64_t i = 0; i < NumData; ++i) {
     if (RelocatedData[i].CounterPtr) {
       ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr;
@@ -400,6 +370,9 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
       ptrdiff_t OffsetIntoCountersSection =
           (char *)DeviceCountersAddr - (char *)DevCntsBegin;
 
+      // New offset: from this data record to its counters in file layout
+      // CountersDelta = BufCountersBegin - BufDataBegin = DataSize + Padding
+      // CounterPtr = CountersDelta + OffsetIntoCounters - (i * sizeof)
       ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters +
                                     OffsetIntoCountersSection -
                                     (i * sizeof(__llvm_profile_data));
@@ -413,111 +386,34 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
                sizeof(RelocatedData[i].Values));
   }
 
-  // Populate header
-  Header.Magic = __llvm_profile_get_magic();
-  Header.Version = __llvm_profile_get_version();
-  Header.BinaryIdsSize = 0; // Not supported for device PGO yet
-  Header.NumData = NumData;
-  Header.PaddingBytesBeforeCounters = PaddingBytesBeforeCounters;
-  Header.NumCounters = NumCounters;
-  Header.PaddingBytesAfterCounters = PaddingBytesAfterCounters;
-  Header.NumBitmapBytes = NumBitmapBytes;
-  Header.PaddingBytesAfterBitmapBytes = PaddingBytesAfterBitmapBytes;
-  Header.NamesSize = NamesSize;
-  Header.CountersDelta = DataSize + PaddingBytesBeforeCounters;
-  Header.BitmapDelta =
-      Header.CountersDelta + CountersSize + PaddingBytesAfterCounters;
-  Header.NamesDelta =
-      Header.BitmapDelta + NumBitmapBytes + PaddingBytesAfterBitmapBytes;
-  Header.NumVTables = 0;
-  Header.VNamesSize = 0;
-  Header.ValueKindLast = 0; // No value profiling
-
-  // Write header and data
-  if (fwrite(&Header, sizeof(__llvm_profile_header), 1, File) != 1)
-    goto write_error;
-  if (fwrite(HostDataBegin, 1, DataSize, File) != DataSize)
-    goto write_error;
-  if (PaddingBytesBeforeCounters > 0 &&
-      fseek(File, PaddingBytesBeforeCounters, SEEK_CUR) != 0)
-    goto write_error;
-  if (fwrite(HostCountersBegin, 1, CountersSize, File) != CountersSize)
-    goto write_error;
-  if (PaddingBytesAfterCounters > 0 &&
-      fseek(File, PaddingBytesAfterCounters, SEEK_CUR) != 0)
-    goto write_error;
-  if (fwrite(HostNamesBegin, 1, NamesSize, File) != NamesSize)
-    goto write_error;
-
-  // Add padding after names to align to 8 bytes (required by profraw reader)
-  {
-    uint64_t NamesPadding = __llvm_profile_get_num_padding_bytes(NamesSize);
-    if (NamesPadding > 0) {
-      char ZeroPadding[8] = {0};
-      if (fwrite(ZeroPadding, 1, NamesPadding, File) != NamesPadding)
-        goto write_error;
-    }
+  // Build TU suffix string for filename
+  char TUIndexStr[16] = "";
+  if (TUIndex >= 0) {
+    snprintf(TUIndexStr, sizeof(TUIndexStr), "%d", TUIndex);
   }
 
-  if (IsVerboseMode())
-    PROF_NOTE("Successfully wrote profile data to %s\n", DeviceFilename);
-
-  // Write uniform counters to a separate file if available
-  if (UniformCountersSize > 0 && HostUniformCountersBegin) {
-    // Create uniform counters filename by replacing extension with .unifcnts
-    size_t DeviceFilenameLen = strlen(DeviceFilename);
-    char *UniformFilename = (char *)malloc(DeviceFilenameLen + 10);
-    if (UniformFilename) {
-      strcpy(UniformFilename, DeviceFilename);
-      // Find and replace .profraw extension
-      char *ext = strrchr(UniformFilename, '.');
-      if (ext) {
-        strcpy(ext, ".unifcnts");
-      } else {
-        strcat(UniformFilename, ".unifcnts");
-      }
-
-      FILE *UniformFile = fopen(UniformFilename, "wb");
-      if (UniformFile) {
-        // Write a simple header: magic, version, num_counters, counters_size
-        uint64_t UniformMagic = 0x55434E5450524F46ULL; // "UCNTPROF" in ASCII
-        uint64_t UniformVersion = 1;
-        uint64_t NumUniformCounters = UniformCountersSize / sizeof(uint64_t);
-
-        if (fwrite(&UniformMagic, sizeof(uint64_t), 1, UniformFile) != 1 ||
-            fwrite(&UniformVersion, sizeof(uint64_t), 1, UniformFile) != 1 ||
-            fwrite(&NumUniformCounters, sizeof(uint64_t), 1, UniformFile) !=
-                1 ||
-            fwrite(&UniformCountersSize, sizeof(uint64_t), 1, UniformFile) !=
-                1 ||
-            fwrite(HostUniformCountersBegin, 1, UniformCountersSize,
-                   UniformFile) != UniformCountersSize) {
-          PROF_WARN("Failed to write uniform counters to %s\n",
-                    UniformFilename);
-        } else if (IsVerboseMode()) {
-          PROF_NOTE(
-              "Successfully wrote %zu uniform counters (%zu bytes) to %s\n",
-              (size_t)NumUniformCounters, UniformCountersSize, UniformFilename);
-        }
-        fclose(UniformFile);
-      } else {
-        PROF_WARN("Failed to open %s for writing uniform counters\n",
-                  UniformFilename);
-      }
-      free(UniformFilename);
-    }
+  // Use shared profile writing API
+  const char *TargetTriple = "amdgcn-amd-amdhsa";
+  ret = __llvm_write_custom_profile(
+      TargetTriple, TUIndex >= 0 ? TUIndexStr : NULL,
+      (__llvm_profile_data *)BufDataBegin,
+      (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin,
+      BufCountersBegin + CountersSize,
+      HostUniformCountersBegin,
+      HostUniformCountersBegin
+          ? HostUniformCountersBegin + UniformCountersSize
+          : NULL,
+      BufNamesBegin, BufNamesBegin + NamesSize, NULL);
+
+  free(ContiguousBuffer);
+
+  if (ret != 0) {
+    PROF_ERR("%s\n", "Failed to write device profile using shared API");
+  } else if (IsVerboseMode()) {
+    PROF_NOTE("%s\n", "Successfully wrote device profile using shared API");
   }
 
-  ret = 0;
-  goto cleanup;
-
-write_error:
-  PROF_ERR("Failed to write to %s\n", DeviceFilename);
-
 cleanup:
-  if (File)
-    fclose(File);
-  free(DeviceFilename);
   free(HostCountersBegin);
   free(HostDataBegin);
   free(HostNamesBegin);
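
For reference, the NewRelativeOffset arithmetic above can be checked in isolation. A minimal standalone sketch (sizes are hypothetical; DataRecordSize stands in for sizeof(__llvm_profile_data)):

  // Standalone sketch of the CounterPtr relocation: the file layout is
  // [Data][Padding][Counters], and CounterPtr is stored relative to the
  // data record itself, so the record's own offset within the data
  // section is subtracted back out.
  #include <cstddef>
  #include <cstdio>

  int main() {
    const std::size_t DataRecordSize = 64; // stand-in for sizeof(__llvm_profile_data)
    const std::size_t DataSize = 3 * DataRecordSize; // three records
    const std::size_t PaddingBytesBeforeCounters = 0;
    const std::size_t i = 1; // second record
    const std::ptrdiff_t OffsetIntoCountersSection = 16;

    const std::ptrdiff_t NewRelativeOffset =
        static_cast<std::ptrdiff_t>(DataSize + PaddingBytesBeforeCounters) +
        OffsetIntoCountersSection -
        static_cast<std::ptrdiff_t>(i * DataRecordSize);
    std::printf("record %zu: relocated CounterPtr = %td\n", i,
                NewRelativeOffset);
    return 0;
  }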
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f12edbe20f65d..5b6f5ae122840 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -2904,11 +2904,14 @@ void InstrLowerer::allocateContiguousProfileArrays() {
     return;
   }
 
-  // Get and cache the CUID for consistent section naming
+  // Get and cache the CUID for consistent section naming.
+  // CUID is only present for HIP compilations (__hip_cuid_* variable).
+  // For OpenMP offload, use the standard per-function allocation.
   CachedCUID = getCUIDFromModule(M);
   if (CachedCUID.empty()) {
-    LLVM_DEBUG(llvm::dbgs() << "No CUID found in module, using fallback\n");
-    CachedCUID = std::to_string(std::hash<std::string>{}(M.getName().str()));
+    LLVM_DEBUG(llvm::dbgs() << "No CUID found (not HIP), using standard "
+                               "per-function allocation\n");
+    return;
   }
 
   LLVM_DEBUG(llvm::dbgs() << "Allocating contiguous arrays for CUID="
@@ -2985,7 +2988,7 @@ void InstrLowerer::allocateContiguousProfileArrays() {
   std::string CntsSectionName = "__llvm_prf_cnts_" + CachedCUID;
   ContiguousCnts = new GlobalVariable(
       M, CntsArrayTy, /*isConstant=*/false, GlobalValue::ExternalLinkage,
-      Constant::getNullValue(CntsArrayTy), "__profc_all_" + CachedCUID);
+      Constant::getNullValue(CntsArrayTy), "__llvm_prf_c_" + CachedCUID);
   ContiguousCnts->setSection(CntsSectionName);
   ContiguousCnts->setAlignment(Align(8));
   ContiguousCnts->setVisibility(GlobalValue::ProtectedVisibility);
@@ -3017,7 +3020,7 @@ void InstrLowerer::allocateContiguousProfileArrays() {
 
     ContiguousData = new GlobalVariable(M, DataArrayTy, /*isConstant=*/false,
                                         GlobalValue::ExternalLinkage, nullptr,
-                                        "__profd_all_" + CachedCUID);
+                                        "__llvm_prf_d_" + CachedCUID);
     ContiguousData->setSection(DataSectionName);
     ContiguousData->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
     ContiguousData->setVisibility(GlobalValue::ProtectedVisibility);
@@ -3307,8 +3310,8 @@ void InstrLowerer::createHIPDeviceVariableRegistration() {
   };
 
   // Per-TU contiguous symbols (device side).
-  std::string CntsSym = std::string("__profc_all_") + CUID;
-  std::string DataSym = std::string("__profd_all_") + CUID;
+  std::string CntsSym = std::string("__llvm_prf_c_") + CUID;
+  std::string DataSym = std::string("__llvm_prf_d_") + CUID;
   std::string UCntsSym = std::string("__profu_all_") + CUID;
   std::string NamesSym = std::string(getInstrProfNamesVarName()) + "_" + CUID;
   registerSectionSymbol(CntsSym);
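
The patch does not show the body of getCUIDFromModule; for readers following along, here is a hypothetical sketch of what it presumably does, based only on the __hip_cuid_* naming mentioned in the comment above (the helper name findHIPCUID is invented here):

  // Hypothetical sketch: recover the HIP CUID by scanning module globals
  // for the __hip_cuid_<id> variable; an empty result means "not HIP",
  // and the lowering falls back to per-function allocation.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/IR/Module.h"
  #include <string>

  static std::string findHIPCUID(const llvm::Module &M) {
    constexpr llvm::StringLiteral Prefix("__hip_cuid_");
    for (const llvm::GlobalVariable &GV : M.globals())
      if (GV.getName().starts_with(Prefix))
        return GV.getName().drop_front(Prefix.size()).str();
    return std::string();
  }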
diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
index 95ce5c30f7f53..65064183a1fd2 100644
--- a/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-contiguous-counters.ll
@@ -11,11 +11,14 @@
 @__profn_kernel2 = private constant [7 x i8] c"kernel2"
 
 ;; Check that contiguous counter array is created with CUID suffix
-; CHECK: @__profc_all_abc123 = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts_abc123", align 8
+; CHECK: @__llvm_prf_c_abc123 = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts_abc123", align 8
 
 ;; Check that contiguous uniform counter array is created for divergence tracking
 ; CHECK: @__profu_all_abc123 = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_ucnts_abc123", align 8
 
+;; Check that contiguous data array is created with CUID suffix
+; CHECK: @__llvm_prf_d_abc123 = protected addrspace(1) global
+
 ;; Check that individual __profc_kernel* symbols are NOT created (contiguous mode)
 ; CHECK-NOT: @__profc_kernel1
 ; CHECK-NOT: @__profc_kernel2
diff --git a/llvm/test/Instrumentation/InstrProfiling/platform.ll b/llvm/test/Instrumentation/InstrProfiling/platform.ll
index eaccf6681a426..ac38071ae3718 100644
--- a/llvm/test/Instrumentation/InstrProfiling/platform.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/platform.ll
@@ -22,25 +22,20 @@
 ; ELF: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
 ; WINDOWS: @__profc_foo = private global [1 x i64] zeroinitializer, section ".lprfc$M", align 8
 ; AIX: @__profc_foo = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", align 8
-;; AMDGPU uses contiguous counter allocation with hash-based naming when no CUID
-; AMDGPU: @__profc_all_{{[0-9]+}} = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts_{{[0-9]+}}", align 8
-; AMDGPU: @__profu_all_{{[0-9]+}} = protected addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_ucnts_{{[0-9]+}}", align 8
+;; AMDGPU without CUID uses per-function allocation (like ELF) for OpenMP compatibility
+; AMDGPU: @__profc_foo = private addrspace(1) global [{{[0-9]+}} x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
 
 ; MACHO: @__profd_foo = private {{.*}}, section "__DATA,__llvm_prf_data,regular,live_support", align 8
 ; ELF: @__profd_foo = private {{.*}}, section "__llvm_prf_data", comdat($__profc_foo), align 8
 ; WINDOWS: @__profd_foo = private global {{.*}}, section ".lprfd$M", align 8
 ; AIX: @__profd_foo = private {{.*}}, section "__llvm_prf_data", align 8
+;; AMDGPU without CUID uses per-function data (not alias)
+; AMDGPU: @__profd_foo = protected addrspace(1) global {{.*}}, section "__llvm_prf_data", comdat($__profc_foo), align 8
+
 ; ELF: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names"{{.*}}, align 1
 ; WINDOWS: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}lprfn$M", align 1
 ; AIX: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names", align 1
-;; AMDGPU uses CUID-suffixed names section
-; AMDGPU: @__llvm_prf_nm_{{[0-9]+}} = protected addrspace(1) constant [{{.*}} x i8] c"{{.*}}", section "__llvm_prf_names", align 1
-
-;; Check for __llvm_offload_prf_<CUID> structure (replaces start/stop symbols for AMDGPU)
-; AMDGPU: @__llvm_offload_prf_{{[0-9]+}} = addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) }
-
-;; AMDGPU uses per-TU contiguous allocation, so @__profd_foo is an alias
-; AMDGPU: @__profd_foo = protected alias
+; AMDGPU: @__llvm_prf_nm = protected addrspace(1) constant [{{.*}} x i8] c"{{.*}}", section "__llvm_prf_names", align 1
 
 define void @foo() {
   call void @llvm.instrprof.increment(ptr @__profn_foo, i64 0, i32 1, i32 0)
@@ -49,9 +44,8 @@ define void @foo() {
 
 declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
 
-;; Start/stop symbols are NOT created for AMDGPU with contiguous allocation
-; AMDGPU-NOT: @__start___llvm_prf_cnts
-; AMDGPU-NOT: @__stop___llvm_prf_cnts
+;; AMDGPU without CUID uses standard per-function allocation (for OpenMP compatibility)
+;; Start/stop symbol behavior is platform-specific
 
 ;; Emit registration functions for platforms that don't find the
 ;; symbols by their sections.
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index af7dac66ca85d..0802b6f818910 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -67,9 +67,11 @@ struct __llvm_profile_data {
 
 extern "C" {
 extern int __attribute__((weak)) __llvm_write_custom_profile(
-    const char *Target, const __llvm_profile_data *DataBegin,
-    const __llvm_profile_data *DataEnd, const char *CountersBegin,
-    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd,
+    const char *Target, const char *TUSuffix,
+    const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd,
+    const char *CountersBegin, const char *CountersEnd,
+    const char *UniformCountersBegin, const char *UniformCountersEnd,
+    const char *NamesBegin, const char *NamesEnd,
     const uint64_t *VersionOverride);
 }
 /// PGO profiling data extracted from a GPU device
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index b92c606d14da1..09416c18a3974 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -311,9 +311,11 @@ Error GPUProfGlobals::write() const {
   memcpy(NamesBegin, NamesData.data(), NamesData.size());
 
   // Invoke compiler-rt entrypoint
+  // Pass NULL for TUSuffix and UniformCounters (not used by OpenMP)
   int result = __llvm_write_custom_profile(
-      TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin,
-      CountersEnd, NamesBegin, NamesEnd, &Version);
+      TargetTriple.str().c_str(), /*TUSuffix=*/nullptr, DataBegin, DataEnd,
+      CountersBegin, CountersEnd, /*UniformCountersBegin=*/nullptr,
+      /*UniformCountersEnd=*/nullptr, NamesBegin, NamesEnd, &Version);
   if (result != 0)
     return Plugin::error(ErrorCode::HOST_IO,
                          "error writing GPU PGO data to file");

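Because __llvm_write_custom_profile is declared weak in GlobalHandler.h, a host-side caller is expected to test the symbol before invoking it. A minimal sketch of that guard, assuming the declarations above (the helper name is hypothetical):

  // The weak symbol's address is null when the profile runtime is not
  // linked in, so probe it before calling rather than crashing.
  static bool profileRuntimeAvailable() {
    return &__llvm_write_custom_profile != nullptr;
  }

  // Sketch of usage inside a writer: skip device profile emission
  // gracefully when the runtime is absent.
  //   if (!profileRuntimeAvailable())
  //     return llvm::Error::success();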
>From 242d37ae6eeec0c3a41a39e866ab94f6c68c2e4d Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Mon, 2 Feb 2026 10:02:51 -0500
Subject: [PATCH 3/5] [compiler-rt][profile] Use weak references for dl*
 functions in ROCm profiling

Use weak references for dlopen, dlsym, and dlerror in
InstrProfilingPlatformROCm.c to avoid requiring -ldl at link time when
the HIP profiling code isn't actually used.

This fixes a link failure when compiling OpenMP offload programs with
PGO instrumentation, where the profile library is linked but libdl is
not. The weak references allow the code to gracefully handle the case
where the dl* functions are unavailable.

Fixes: https://github.com/llvm/llvm-project/pull/177665#discussion_r12345
---
 .../lib/profile/InstrProfilingPlatformROCm.c  | 48 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
index d1c13089227b0..a668ab597a8b0 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -14,6 +14,22 @@
 #include <stdlib.h>
 #include <string.h>
 
+/* Use weak references for dl* functions to avoid requiring -ldl at link time.
+ *
+ * The profile runtime is a static library, so its dependencies must be
+ * explicitly linked by the user. Unlike sanitizer runtimes (which are often
+ * shared libraries with their own dependencies), adding -ldl globally would
+ * affect all profiling users, including those not using HIP/ROCm.
+ *
+ * With weak references:
+ * - Programs without -ldl link successfully (dl* resolve to NULL)
+ * - HIP programs get -ldl from the HIP runtime, so dl* work normally
+ * - OpenMP offload programs without HIP gracefully skip device profiling
+ */
+#pragma weak dlopen
+#pragma weak dlsym
+#pragma weak dlerror
+
 static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex);
 
 static int IsVerboseMode() {
@@ -48,10 +64,18 @@ static void EnsureHipLoaded(void) {
     return;
   Initialized = 1;
 
+  /* Check if dlopen is available (weak symbol may be NULL if -ldl not linked) */
+  if (!dlopen) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s", "dlopen not available - HIP profiling disabled\n");
+    return;
+  }
+
   void *Handle = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL);
   if (!Handle) {
-    fprintf(stderr, "compiler-rt: failed to open libamdhip64.so: %s\n",
-            dlerror());
+    if (dlerror)
+      fprintf(stderr, "compiler-rt: failed to open libamdhip64.so: %s\n",
+              dlerror());
     return;
   }
 
@@ -431,6 +455,12 @@ static int ProcessShadowVariable(void *ShadowVar, int TUIndex) {
   return ProcessDeviceOffloadPrf(DeviceOffloadPrf, TUIndex);
 }
 
+/* Check if HIP runtime is available and loaded */
+static int IsHipAvailable(void) {
+  EnsureHipLoaded();
+  return pHipMemcpy != NULL && pHipGetSymbolAddress != NULL;
+}
+
 /* -------------------------------------------------------------------------- */
 /*  Collect device-side profile data                                          */
 /* -------------------------------------------------------------------------- */
@@ -439,6 +469,20 @@ int __llvm_profile_hip_collect_device_data(void) {
   if (IsVerboseMode())
     PROF_NOTE("%s", "__llvm_profile_hip_collect_device_data called\n");
 
+  /* Early return if no HIP profile data was registered */
+  if (NumShadowVariables == 0 && NumDynamicModules == 0) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s", "No HIP profile data registered, skipping collection\n");
+    return 0;
+  }
+
+  /* Early return if HIP runtime is not available */
+  if (!IsHipAvailable()) {
+    if (IsVerboseMode())
+      PROF_NOTE("%s", "HIP runtime not available, skipping collection\n");
+    return 0;
+  }
+
   int Ret = 0;
 
   /* Shadow variables (static-linked kernels) */
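
The same guard pattern can be exercised outside the runtime. A self-contained sketch for ELF targets, compiled without -ldl (note that on glibc >= 2.34 the dl* functions live in libc itself, so the null branch is only taken where libdl is genuinely absent):

  // weak_dl.cpp: link WITHOUT -ldl; with the weak pragmas, dlopen and
  // dlerror resolve to null if no definition is available at link time,
  // and the guard turns that into a graceful skip.
  #include <cstdio>
  #include <dlfcn.h>

  #pragma weak dlopen
  #pragma weak dlerror

  int main() {
    if (!dlopen) { // unresolved weak function compares equal to null
      std::puts("dlopen unavailable - device profiling skipped");
      return 0;
    }
    void *Handle = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_LOCAL);
    if (!Handle && dlerror)
      std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 0;
  }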

>From e71d496d7902809f697d9809dbc6c21ae6bdf9ed Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Mon, 2 Feb 2026 22:36:59 -0500
Subject: [PATCH 4/5] [Docs][HIP] Add Profile Guided Optimization documentation

Document the PGO workflow for HIP programs:
- Instrumented build with -fprofile-generate
- Profile collection (separate host/device profiles)
- Profile merging with llvm-profdata
- Optimized build with -Xarch_host/-Xarch_device -fprofile-use
- Multi-TU support
- Debug output with LLVM_PROFILE_VERBOSE
---
 clang/docs/HIPSupport.rst | 78 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 5 deletions(-)

diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index c2a91a3062bc3..5d8a55382b0dd 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -53,20 +53,20 @@ To compile a HIP program, use the following command:
 
 .. code-block:: shell
 
-   clang++ -c --offload-arch=gfx906 -xhip sample.cpp -o sample.o
+   clang++ -c --offload-arch=gfx1200 -xhip sample.cpp -o sample.o
 
 The ``-xhip`` option indicates that the source is a HIP program. If the file has a ``.hip`` extension,
 Clang will automatically recognize it as a HIP program:
 
 .. code-block:: shell
 
-   clang++ -c --offload-arch=gfx906 sample.hip -o sample.o
+   clang++ -c --offload-arch=gfx1200 sample.hip -o sample.o
 
 To link a HIP program, use this command:
 
 .. code-block:: shell
 
-   clang++ --hip-link --offload-arch=gfx906 sample.o -o sample
+   clang++ --hip-link --offload-arch=gfx1200 sample.o -o sample
 
 In the above command, the ``--hip-link`` flag instructs Clang to link the HIP runtime library. However,
 the use of this flag is unnecessary if a HIP input file is already present in your program.
@@ -75,9 +75,9 @@ For convenience, Clang also supports compiling and linking in a single step:
 
 .. code-block:: shell
 
-   clang++ --offload-arch=gfx906 -xhip sample.cpp -o sample
+   clang++ --offload-arch=gfx1200 -xhip sample.cpp -o sample
 
-In the above commands, ``gfx906`` is the GPU architecture that the code is being compiled for. The supported GPU
+In the above commands, ``gfx1200`` is the GPU architecture that the code is being compiled for. The supported GPU
 architectures can be found in the `AMDGPU Processor Table <https://llvm.org/docs/AMDGPUUsage.html#processors>`_.
 Alternatively, you can use the ``amdgpu-arch`` tool that comes with Clang to list the GPU architecture on your system:
 
@@ -412,6 +412,74 @@ Example Usage
    __host__ __device__ int Four(void) __attribute__((weak, alias("_Z6__Fourv")));
    __host__ __device__ float Four(float f) __attribute__((weak, alias("_Z6__Fourf")));
 
+Profile Guided Optimization (PGO)
+=================================
+
+Clang supports Profile Guided Optimization (PGO) for HIP, enabling optimization
+of both host and device code based on runtime execution profiles.
+
+Workflow
+--------
+
+The PGO workflow consists of four steps:
+
+1. **Instrumented Build**: Compile with ``-fprofile-generate`` to create an
+   instrumented binary that collects execution profiles:
+
+   .. code-block:: shell
+
+      clang++ -O2 -fprofile-generate --offload-arch=gfx1200 -xhip app.hip -o app_instrumented
+
+2. **Profile Collection**: Run the instrumented binary with representative
+   workloads. This generates separate profile files for host and device:
+
+   .. code-block:: shell
+
+      ./app_instrumented
+      # Creates: default_<id>.profraw (host)
+      #          default_<id>.amdgcn-amd-amdhsa.<tu>.profraw (device)
+
+3. **Merge Profiles**: Use ``llvm-profdata`` to merge the raw profiles:
+
+   .. code-block:: shell
+
+      # Merge host profiles
+      llvm-profdata merge -o app.profdata default_*_0.profraw
+
+      # Merge device profiles
+      llvm-profdata merge -o app.amdgcn-amd-amdhsa.profdata \
+          default_*.amdgcn-amd-amdhsa.*.profraw
+
+4. **Optimized Build**: Rebuild with ``-fprofile-use``, specifying separate
+   profile files for host and device using ``-Xarch_host`` and ``-Xarch_device``:
+
+   .. code-block:: shell
+
+      clang++ -O2 --offload-arch=gfx1200 -xhip app.hip -o app_optimized \
+          -Xarch_host -fprofile-use=app.profdata \
+          -Xarch_device -fprofile-use=app.amdgcn-amd-amdhsa.profdata
+
+Debug Output
+------------
+
+Set ``LLVM_PROFILE_VERBOSE=1`` to see diagnostic messages during profile
+collection:
+
+.. code-block:: shell
+
+   LLVM_PROFILE_VERBOSE=1 ./app_instrumented
+
+This shows information about profile data registration, device memory
+operations, and profile file creation.
+
+Limitations
+-----------
+
+- Device PGO is supported only on AMD GPUs with HIP.
+- Value profiling is not supported for device code.
+- Uniformity analysis assumes a wave size of 32 by default; pass the
+  ``--wave-size`` option to ``llvm-profdata merge`` to override it.
+
 C++17 Class Template Argument Deduction (CTAD) Support
 ======================================================
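
As a concrete starting point for the documented workflow, here is a minimal app.hip one might instrument (hypothetical example, not part of the patch):

  // app.hip: small kernel with a divergent tail branch for the device
  // profile to count; build and run it through the four steps above.
  #include <hip/hip_runtime.h>
  #include <cstdio>

  __global__ void scale(float *X, float A, int N) {
    int I = blockIdx.x * blockDim.x + threadIdx.x;
    if (I < N) // diverges only in the final wave
      X[I] *= A;
  }

  int main() {
    const int N = (1 << 20) + 3; // not a multiple of the block size
    float *X = nullptr;
    if (hipMalloc(&X, N * sizeof(float)) != hipSuccess)
      return 1;
    scale<<<(N + 255) / 256, 256>>>(X, 2.0f, N);
    (void)hipDeviceSynchronize();
    (void)hipFree(X);
    std::puts("done");
    return 0;
  }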
 

>From 1d22cf20ef0e885a1714e47544014a3746115518 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Tue, 3 Feb 2026 11:00:58 -0500
Subject: [PATCH 5/5] [NFC] Fix clang-format issues in profile runtime

Fix indentation and line wrapping issues flagged by CI.

Co-authored-by: Cursor <cursoragent at cursor.com>
---
 compiler-rt/lib/profile/InstrProfilingFile.c    | 16 ++++++++--------
 .../lib/profile/InstrProfilingPlatformROCm.c    | 17 +++++++++--------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 87074b0c369ee..45262d7808982 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1285,14 +1285,14 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
 
 #ifndef __APPLE__
 int __llvm_write_custom_profile(const char *Target, const char *TUSuffix,
-                                   const __llvm_profile_data *DataBegin,
-                                   const __llvm_profile_data *DataEnd,
-                                   const char *CountersBegin,
-                                   const char *CountersEnd,
-                                   const char *UniformCountersBegin,
-                                   const char *UniformCountersEnd,
-                                   const char *NamesBegin, const char *NamesEnd,
-                                   const uint64_t *VersionOverride) {
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd,
+                                const char *UniformCountersBegin,
+                                const char *UniformCountersEnd,
+                                const char *NamesBegin, const char *NamesEnd,
+                                const uint64_t *VersionOverride) {
   int ReturnValue = 0, FilenameLength, TargetLength, TUSuffixLength;
   char *FilenameBuf, *TargetFilename;
   const char *Filename;
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
index a668ab597a8b0..ee4a51c75fe66 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.c
@@ -64,7 +64,8 @@ static void EnsureHipLoaded(void) {
     return;
   Initialized = 1;
 
-  /* Check if dlopen is available (weak symbol may be NULL if -ldl not linked) */
+  /* Check if dlopen is available (weak symbol may be NULL if -ldl not linked)
+   */
   if (!dlopen) {
     if (IsVerboseMode())
       PROF_NOTE("%s", "dlopen not available - HIP profiling disabled\n");
@@ -360,7 +361,8 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
   }
 
   // Create contiguous buffer with layout: [Data][Padding][Counters][Names]
-  // This ensures CountersBegin - DataBegin = DataSize + PaddingBytesBeforeCounters
+  // This ensures CountersBegin - DataBegin = DataSize +
+  // PaddingBytesBeforeCounters
   size_t ContiguousBufferSize =
       DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
   char *ContiguousBuffer = (char *)malloc(ContiguousBufferSize);
@@ -372,7 +374,8 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
 
   // Set up pointers into the contiguous buffer
   char *BufDataBegin = ContiguousBuffer;
-  char *BufCountersBegin = ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
+  char *BufCountersBegin =
+      ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
   char *BufNamesBegin = BufCountersBegin + CountersSize;
 
   // Copy data into contiguous buffer
@@ -422,11 +425,9 @@ static int ProcessDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex) {
       TargetTriple, TUIndex >= 0 ? TUIndexStr : NULL,
       (__llvm_profile_data *)BufDataBegin,
       (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin,
-      BufCountersBegin + CountersSize,
-      HostUniformCountersBegin,
-      HostUniformCountersBegin
-          ? HostUniformCountersBegin + UniformCountersSize
-          : NULL,
+      BufCountersBegin + CountersSize, HostUniformCountersBegin,
+      HostUniformCountersBegin ? HostUniformCountersBegin + UniformCountersSize
+                               : NULL,
       BufNamesBegin, BufNamesBegin + NamesSize, NULL);
 
   free(ContiguousBuffer);


