[clang] [compiler-rt] [llvm] [PGO][Offload] Allow PGO flags to be used on GPU targets (PR #94268)

Ethan Luis McDonough via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 22:39:35 PDT 2025


https://github.com/EthanLuisMcDonough updated https://github.com/llvm/llvm-project/pull/94268

>From 3a2047c273d948d035b50eb486b772d5b3bdc401 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 18 Mar 2025 16:20:14 -0500
Subject: [PATCH 1/3] [PGO][Offload] Allow PGO flags to be used on GPU targets

---
 clang/lib/Driver/ToolChains/Clang.cpp         |  6 +-
 clang/test/Driver/cuda-no-pgo-or-coverage.cu  | 33 --------
 compiler-rt/include/profile/InstrProfData.inc |  2 +-
 compiler-rt/lib/profile/InstrProfiling.h      |  3 +-
 .../lib/profile/InstrProfilingBuffer.c        |  3 +-
 compiler-rt/lib/profile/InstrProfilingFile.c  | 22 +++--
 .../lib/profile/InstrProfilingInternal.h      |  3 +-
 .../lib/profile/InstrProfilingWriter.c        | 20 ++---
 .../llvm/ProfileData/InstrProfData.inc        |  2 +-
 .../Instrumentation/PGOInstrumentation.cpp    |  5 +-
 .../llvm-profdata/binary-ids-padding.test     |  2 +-
 ...alformed-not-space-for-another-header.test |  2 +-
 .../malformed-num-counters-zero.test          |  2 +-
 .../malformed-ptr-to-counter-array.test       |  2 +-
 .../common/include/GlobalHandler.h            |  6 +-
 .../common/src/GlobalHandler.cpp              | 18 +++-
 offload/test/offloading/gpupgo/pgo1.c         | 84 +++++++++++++++++++
 offload/test/offloading/gpupgo/pgo2.c         | 76 +++++++++++++++++
 offload/test/offloading/pgo1.c                | 66 ---------------
 19 files changed, 220 insertions(+), 137 deletions(-)
 delete mode 100644 clang/test/Driver/cuda-no-pgo-or-coverage.cu
 create mode 100644 offload/test/offloading/gpupgo/pgo1.c
 create mode 100644 offload/test/offloading/gpupgo/pgo2.c
 delete mode 100644 offload/test/offloading/pgo1.c

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 1012128085c7a..e0f1206496486 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6387,11 +6387,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
                   options::OPT_fno_convergent_functions);
 
-  // NVPTX/AMDGCN doesn't support PGO or coverage. There's no runtime support
-  // for sampling, overhead of call arc collection is way too high and there's
-  // no way to collect the output.
-  if (!Triple.isNVPTX() && !Triple.isAMDGCN())
-    addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
+  addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
 
   Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
 
diff --git a/clang/test/Driver/cuda-no-pgo-or-coverage.cu b/clang/test/Driver/cuda-no-pgo-or-coverage.cu
deleted file mode 100644
index b84587e1e182b..0000000000000
--- a/clang/test/Driver/cuda-no-pgo-or-coverage.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// Check that profiling/coverage arguments doen't get passed down to device-side
-// compilation.
-//
-//
-// XRUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// XRUN:   -fprofile-generate %s 2>&1 | \
-// XRUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -fprofile-instr-generate %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -coverage %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
-// RUN:   -ftest-coverage %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,GCOV %s
-//
-// RUN: not %clang -### --target=x86_64-linux-gnu -c --cuda-gpu-arch=sm_20   \
-// RUN:   -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
-// RUN:   FileCheck --check-prefixes=CHECK,PROF %s
-//
-//
-// CHECK-NOT: error: unsupported option '-fprofile
-// CHECK-NOT: error: invalid argument
-// CHECK-DAG: "-fcuda-is-device"
-// CHECK-NOT: "-f{{[^"/]*coverage.*}}"
-// CHECK-NOT: "-fprofile{{[^"]*}}"
-// CHECK: "-triple" "x86_64-unknown-linux-gnu"
-// PROF:      "-fprofile{{.*}}"
-// GCOV:      "-coverage-notes-file=
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 2cdfea9a579a4..d51b58386f168 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -152,7 +152,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \
 #define INSTR_PROF_DATA_DEFINED
 #endif
 INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters)
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 77c8d6c79322d..a90558fdcfbbf 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -310,7 +310,8 @@ int __llvm_write_custom_profile(const char *Target,
                                 const __llvm_profile_data *DataEnd,
                                 const char *CountersBegin,
                                 const char *CountersEnd, const char *NamesBegin,
-                                const char *NamesEnd);
+                                const char *NamesEnd,
+                                const uint64_t *VersionOverride);
 
 /*!
  * This variable is defined in InstrProfilingRuntime.cpp as a hidden
diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c
index 1c451d7ec7563..b406e8db74f3f 100644
--- a/compiler-rt/lib/profile/InstrProfilingBuffer.c
+++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c
@@ -252,5 +252,6 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal(
       &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd,
       BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd,
       /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL,
-      /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0);
+      /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0,
+      __llvm_profile_get_version());
 }
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 4667c02892505..19467429cf4c3 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1273,10 +1273,13 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
   return 0;
 }
 
-COMPILER_RT_USED int __llvm_write_custom_profile(
-    const char *Target, const __llvm_profile_data *DataBegin,
-    const __llvm_profile_data *DataEnd, const char *CountersBegin,
-    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd) {
+int __llvm_write_custom_profile(const char *Target,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd, const char *NamesBegin,
+                                const char *NamesEnd,
+                                const uint64_t *VersionOverride) {
   int ReturnValue = 0, FilenameLength, TargetLength;
   char *FilenameBuf, *TargetFilename;
   const char *Filename;
@@ -1358,10 +1361,15 @@ COMPILER_RT_USED int __llvm_write_custom_profile(
   ProfDataWriter fileWriter;
   initFileWriter(&fileWriter, OutputFile);
 
+  uint64_t Version = __llvm_profile_get_version();
+  if (VersionOverride)
+    Version = *VersionOverride;
+
   /* Write custom data to the file */
-  ReturnValue = lprofWriteDataImpl(
-      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
-      lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);
+  ReturnValue =
+      lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin,
+                         CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL,
+                         NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version);
   closeFileObject(OutputFile);
 
   // Restore SIGKILL.
diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h
index b100343ca04f9..03df71828b91d 100644
--- a/compiler-rt/lib/profile/InstrProfilingInternal.h
+++ b/compiler-rt/lib/profile/InstrProfilingInternal.h
@@ -160,7 +160,8 @@ int lprofWriteDataImpl(ProfDataWriter *Writer,
                        VPDataReaderType *VPDataReader, const char *NamesBegin,
                        const char *NamesEnd, const VTableProfData *VTableBegin,
                        const VTableProfData *VTableEnd, const char *VNamesBegin,
-                       const char *VNamesEnd, int SkipNameDataWrite);
+                       const char *VNamesEnd, int SkipNameDataWrite,
+                       uint64_t Version);
 
 /* Merge value profile data pointed to by SrcValueProfData into
  * in-memory profile counters pointed by to DstData.  */
diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c
index 8816a71155511..bcd88b30d050d 100644
--- a/compiler-rt/lib/profile/InstrProfilingWriter.c
+++ b/compiler-rt/lib/profile/InstrProfilingWriter.c
@@ -254,21 +254,21 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer,
   const VTableProfData *VTableEnd = __llvm_profile_end_vtables();
   const char *VNamesBegin = __llvm_profile_begin_vtabnames();
   const char *VNamesEnd = __llvm_profile_end_vtabnames();
+  uint64_t Version = __llvm_profile_get_version();
   return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin,
                             CountersEnd, BitmapBegin, BitmapEnd, VPDataReader,
                             NamesBegin, NamesEnd, VTableBegin, VTableEnd,
-                            VNamesBegin, VNamesEnd, SkipNameDataWrite);
+                            VNamesBegin, VNamesEnd, SkipNameDataWrite, Version);
 }
 
-COMPILER_RT_VISIBILITY int
-lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
-                   const __llvm_profile_data *DataEnd,
-                   const char *CountersBegin, const char *CountersEnd,
-                   const char *BitmapBegin, const char *BitmapEnd,
-                   VPDataReaderType *VPDataReader, const char *NamesBegin,
-                   const char *NamesEnd, const VTableProfData *VTableBegin,
-                   const VTableProfData *VTableEnd, const char *VNamesBegin,
-                   const char *VNamesEnd, int SkipNameDataWrite) {
+COMPILER_RT_VISIBILITY int lprofWriteDataImpl(
+    ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
+    const __llvm_profile_data *DataEnd, const char *CountersBegin,
+    const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd,
+    VPDataReaderType *VPDataReader, const char *NamesBegin,
+    const char *NamesEnd, const VTableProfData *VTableBegin,
+    const VTableProfData *VTableEnd, const char *VNamesBegin,
+    const char *VNamesEnd, int SkipNameDataWrite, uint64_t Version) {
   /* Calculate size of sections. */
   const uint64_t DataSectionSize =
       __llvm_profile_get_data_size(DataBegin, DataEnd);
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 2cdfea9a579a4..d51b58386f168 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -152,7 +152,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \
 #define INSTR_PROF_DATA_DEFINED
 #endif
 INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData)
 INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters)
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index a8055979acaa2..ea4be07d0c8c8 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -462,7 +462,10 @@ createIRLevelProfileFlagVar(Module &M,
   auto IRLevelVersionVariable = new GlobalVariable(
       M, IntTy64, true, GlobalValue::WeakAnyLinkage,
       Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName);
-  IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility);
+  if (isGPUProfTarget(M))
+    IRLevelVersionVariable->setVisibility(GlobalValue::ProtectedVisibility);
+  else
+    IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility);
   Triple TT(M.getTargetTriple());
   if (TT.supportsCOMDAT()) {
     IRLevelVersionVariable->setLinkage(GlobalValue::ExternalLinkage);
diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
index 292c582b45c52..f31aa15bfe6c9 100644
--- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test
+++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
@@ -1,7 +1,7 @@
 // Header
 //
 // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize)
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize)
diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
index 705e5efaf5875..44be2980bb2f2 100644
--- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
+++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
@@ -1,7 +1,7 @@
 // Header
 //
 // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize)
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize)
diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
index 157c13b926a7e..9af9d65a6bdba 100644
--- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
+++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
@@ -1,7 +1,7 @@
 // Header
 //
 // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize)
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize)
diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
index 83cf76f68fb63..49c5ae9b0931d 100644
--- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
+++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
@@ -1,7 +1,7 @@
 // Header
 //
 // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
-// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version)
 // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
 // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize)
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize)
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index e030ab9e6b61f..5c763d6c71726 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 
+#include <optional>
 #include <type_traits>
 
 #include "llvm/ADT/DenseMap.h"
@@ -67,15 +68,16 @@ extern "C" {
 extern int __attribute__((weak)) __llvm_write_custom_profile(
     const char *Target, const __llvm_profile_data *DataBegin,
     const __llvm_profile_data *DataEnd, const char *CountersBegin,
-    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
+    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd,
+    const uint64_t *VersionOverride);
 }
-
 /// PGO profiling data extracted from a GPU device
 struct GPUProfGlobals {
   SmallVector<int64_t> Counts;
   SmallVector<__llvm_profile_data> Data;
   SmallVector<uint8_t> NamesData;
   Triple TargetTriple;
+  std::optional<uint64_t> Version;
 
   void dump() const;
   Error write() const;
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 8783490831e25..9b9233c95e567 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -16,6 +16,7 @@
 
 #include "Shared/Utils.h"
 
+#include "llvm/ProfileData/InstrProfData.inc"
 #include "llvm/Support/Error.h"
 
 #include <cstring>
@@ -214,6 +215,13 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
       if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
         return Err;
       DeviceProfileData.Data.push_back(std::move(Data));
+    } else if (*NameOrErr == INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)) {
+      uint64_t RawVersionData;
+      GlobalTy RawVersionGlobal(NameOrErr->str(), Sym.getSize(),
+                                &RawVersionData);
+      if (auto Err = readGlobalFromDevice(Device, Image, RawVersionGlobal))
+        return Err;
+      DeviceProfileData.Version = RawVersionData;
     }
   }
   return DeviceProfileData;
@@ -265,7 +273,7 @@ void GPUProfGlobals::dump() const {
 }
 
 Error GPUProfGlobals::write() const {
-  if (!__llvm_write_custom_profile)
+  if (__llvm_write_custom_profile == nullptr)
     return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
                          "The compiler-rt profiling library must be linked for "
                          "GPU PGO to work.");
@@ -274,6 +282,8 @@ Error GPUProfGlobals::write() const {
          CountsSize = Counts.size() * sizeof(int64_t);
   __llvm_profile_data *DataBegin, *DataEnd;
   char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd;
+  const uint64_t *VersionOverride =
+      Version.has_value() ? &Version.value() : nullptr;
 
   // Initialize array of contiguous data. We need to make sure each section is
   // contiguous so that the PGO library can compute deltas properly
@@ -295,9 +305,9 @@ Error GPUProfGlobals::write() const {
   memcpy(NamesBegin, NamesData.data(), NamesData.size());
 
   // Invoke compiler-rt entrypoint
-  int result = __llvm_write_custom_profile(TargetTriple.str().c_str(),
-                                           DataBegin, DataEnd, CountersBegin,
-                                           CountersEnd, NamesBegin, NamesEnd);
+  int result = __llvm_write_custom_profile(
+      TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin,
+      CountersEnd, NamesBegin, NamesEnd, VersionOverride);
   if (result != 0)
     return Plugin::error("Error writing GPU PGO data to file");
 
diff --git a/offload/test/offloading/gpupgo/pgo1.c b/offload/test/offloading/gpupgo/pgo1.c
new file mode 100644
index 0000000000000..c8011cbae83c0
--- /dev/null
+++ b/offload/test/offloading/gpupgo/pgo1.c
@@ -0,0 +1,84 @@
+// RUN: %libomptarget-compile-generic -fcreate-profile \
+// RUN:     -Xarch_device -fprofile-generate
+// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
+// RUN:     %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.%basename_t.llvm.profraw | \
+// RUN:     %fcheck-generic --check-prefix="LLVM-PGO"
+
+// RUN: %libomptarget-compile-generic -fcreate-profile \
+// RUN:     -Xarch_device -fprofile-instr-generate
+// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
+// RUN:     %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.%basename_t.clang.profraw | \
+// RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
+
+// REQUIRES: gpu
+// REQUIRES: pgo
+
+int test1(int a) { return a / 2; }
+int test2(int a) { return a * 2; }
+
+int main() {
+  int m = 2;
+#pragma omp target
+  for (int i = 0; i < 10; i++) {
+    m = test1(m);
+    for (int j = 0; j < 2; j++) {
+      m = test2(m);
+    }
+  }
+}
+
+// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 4
+// LLVM-PGO: Block counts: [20, 10, 2, 1]
+
+// LLVM-PGO-LABEL: test1:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 1
+// LLVM-PGO: Block counts: [10]
+
+// LLVM-PGO-LABEL: test2:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 1
+// LLVM-PGO: Block counts: [20]
+
+// LLVM-PGO-LABEL: Instrumentation level:
+// LLVM-PGO-SAME: IR
+// LLVM-PGO-SAME: entry_first = 0
+// LLVM-PGO-LABEL: Functions shown:
+// LLVM-PGO-SAME: 3
+// LLVM-PGO-LABEL: Maximum function count:
+// LLVM-PGO-SAME: 20
+// LLVM-PGO-LABEL: Maximum internal block count:
+// LLVM-PGO-SAME: 10
+
+// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 3
+// CLANG-PGO: Function count: 0
+// CLANG-PGO: Block counts: [11, 20]
+
+// CLANG-PGO-LABEL: test1:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 1
+// CLANG-PGO: Function count: 10
+// CLANG-PGO: Block counts: []
+
+// CLANG-PGO-LABEL: test2:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 1
+// CLANG-PGO: Function count: 20
+// CLANG-PGO: Block counts: []
+
+// CLANG-PGO-LABEL: Instrumentation level:
+// CLANG-PGO-SAME: Front-end
+// CLANG-PGO-LABEL: Functions shown:
+// CLANG-PGO-SAME: 3
+// CLANG-PGO-LABEL: Maximum function count:
+// CLANG-PGO-SAME: 20
+// CLANG-PGO-LABEL: Maximum internal block count:
+// CLANG-PGO-SAME: 20
diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo2.c
new file mode 100644
index 0000000000000..35dae74087d92
--- /dev/null
+++ b/offload/test/offloading/gpupgo/pgo2.c
@@ -0,0 +1,76 @@
+// RUN: %libomptarget-compile-generic -fprofile-generate
+// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
+// RUN:     %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %basename_t.llvm.profraw | %fcheck-generic \
+// RUN:     --check-prefix="LLVM-HOST"
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.%basename_t.llvm.profraw \
+// RUN:     | %fcheck-generic --check-prefix="LLVM-DEVICE"
+
+// RUN: %libomptarget-compile-generic -fprofile-instr-generate
+// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
+// RUN:     %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %basename_t.clang.profraw | %fcheck-generic \
+// RUN:     --check-prefix="CLANG-HOST"
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.%basename_t.clang.profraw | \
+// RUN:     %fcheck-generic --check-prefix="CLANG-DEV"
+
+// RUN: %libomptarget-compile-generic -Xarch_host -fprofile-generate
+// RUN: env LLVM_PROFILE_FILE=%basename_t.nogpu.profraw \
+// RUN:     %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %basename_t.nogpu.profraw | %fcheck-generic \
+// RUN:     --check-prefix="LLVM-HOST"
+// RUN: not test -e %target_triple.%basename_t.nogpu.profraw
+
+// REQUIRES: gpu
+// REQUIRES: pgo
+
+int main() {
+  int host_var = 0;
+  for (int i = 0; i < 20; i++) {
+    host_var += i;
+  }
+
+  int device_var = 1;
+#pragma omp target
+  for (int i = 0; i < 10; i++) {
+    device_var *= i;
+  }
+}
+
+// LLVM-HOST-LABEL: main:
+// LLVM-HOST: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-HOST: Counters: 3
+// LLVM-HOST: Block counts: [20, 1, 0]
+
+// LLVM-HOST-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// LLVM-HOST: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-HOST: Counters: 2
+// LLVM-HOST: Block counts: [0, 0]
+
+// LLVM-DEVICE-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// LLVM-DEVICE: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-DEVICE: Counters: 3
+// LLVM-DEVICE: Block counts: [10, 2, 1]
+
+// CLANG-HOST-LABEL: main:
+// CLANG-HOST: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-HOST: Counters: 2
+// CLANG-HOST: Function count: 1
+// CLANG-HOST: Block counts: [20]
+
+// CLANG-HOST-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// CLANG-HOST: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-HOST: Counters: 2
+// CLANG-HOST: Function count: 0
+// CLANG-HOST: Block counts: [0]
+
+// CLANG-DEV-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// CLANG-DEV: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-DEV: Counters: 2
+// CLANG-DEV: Function count: 0
+// CLANG-DEV: Block counts: [11]
diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
deleted file mode 100644
index 6fe4487ffb67f..0000000000000
--- a/offload/test/offloading/pgo1.c
+++ /dev/null
@@ -1,66 +0,0 @@
-// RUN: %libomptarget-compile-generic -fprofile-generate \
-// RUN:     -Xclang "-fprofile-instrument=llvm"
-// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
-// RUN: %profdata show --all-functions --counts \
-// RUN:     %target_triple.llvm.profraw | %fcheck-generic \
-// RUN:     --check-prefix="LLVM-PGO"
-
-// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
-// RUN:     -Xclang "-fprofile-instrument=clang"
-// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
-// RUN: %profdata show --all-functions --counts \
-// RUN:     %target_triple.clang.profraw | %fcheck-generic \
-// RUN:     --check-prefix="CLANG-PGO"
-
-// REQUIRES: gpu
-// REQUIRES: pgo
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-int test1(int a) { return a / 2; }
-int test2(int a) { return a * 2; }
-
-int main() {
-  int m = 2;
-#pragma omp target
-  for (int i = 0; i < 10; i++) {
-    m = test1(m);
-    for (int j = 0; j < 2; j++) {
-      m = test2(m);
-    }
-  }
-}
-// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
-// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// LLVM-PGO: Counters: 4
-// LLVM-PGO: Block counts: [20, 10, 2, 1]
-
-// LLVM-PGO-LABEL: test1:
-// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// LLVM-PGO: Counters: 1
-// LLVM-PGO: Block counts: [10]
-
-// LLVM-PGO-LABEL: test2:
-// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// LLVM-PGO: Counters: 1
-// LLVM-PGO: Block counts: [20]
-
-// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
-// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// CLANG-PGO: Counters: 3
-// CLANG-PGO: Function count: 0
-// CLANG-PGO: Block counts: [11, 20]
-
-// CLANG-PGO-LABEL: test1:
-// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// CLANG-PGO: Counters: 1
-// CLANG-PGO: Function count: 10
-// CLANG-PGO: Block counts: []
-
-// CLANG-PGO-LABEL: test2:
-// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
-// CLANG-PGO: Counters: 1
-// CLANG-PGO: Function count: 20
-// CLANG-PGO: Block counts: []

>From 3fcadedd08c7a824ecc1d59f8334559be8157d55 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 18 Mar 2025 16:41:49 -0500
Subject: [PATCH 2/3] Revert == nullptr check to !

---
 offload/plugins-nextgen/common/src/GlobalHandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 9b9233c95e567..89f59f56f09e8 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -273,7 +273,7 @@ void GPUProfGlobals::dump() const {
 }
 
 Error GPUProfGlobals::write() const {
-  if (__llvm_write_custom_profile == nullptr)
+  if (!__llvm_write_custom_profile)
     return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
                          "The compiler-rt profiling library must be linked for "
                          "GPU PGO to work.");

>From 298dafc5e9553eeb69e9a52b7e9367153258db24 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 19 Mar 2025 00:37:37 -0500
Subject: [PATCH 3/3] Fix version extraction

---
 clang/lib/CodeGen/CodeGenPGO.cpp                           | 3 +++
 llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 7 ++++---
 offload/plugins-nextgen/common/include/GlobalHandler.h     | 3 +--
 offload/plugins-nextgen/common/src/GlobalHandler.cpp       | 4 +---
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 792373839107f..ff8b1339de966 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1357,6 +1357,9 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) {
 
     IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility);
     llvm::Triple TT(M.getTargetTriple());
+    if (TT.isAMDGPU() || TT.isNVPTX())
+      IRLevelVersionVariable->setVisibility(
+          llvm::GlobalValue::ProtectedVisibility);
     if (TT.supportsCOMDAT()) {
       IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage);
       IRLevelVersionVariable->setComdat(M.getOrInsertComdat(VarName));
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index ea4be07d0c8c8..bc704b3f89c44 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -462,10 +462,11 @@ createIRLevelProfileFlagVar(Module &M,
   auto IRLevelVersionVariable = new GlobalVariable(
       M, IntTy64, true, GlobalValue::WeakAnyLinkage,
       Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName);
+  IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility);
   if (isGPUProfTarget(M))
-    IRLevelVersionVariable->setVisibility(GlobalValue::ProtectedVisibility);
-  else
-    IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility);
+    IRLevelVersionVariable->setVisibility(
+        llvm::GlobalValue::ProtectedVisibility);
+
   Triple TT(M.getTargetTriple());
   if (TT.supportsCOMDAT()) {
     IRLevelVersionVariable->setLinkage(GlobalValue::ExternalLinkage);
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 5c763d6c71726..6def53430a7c0 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 
-#include <optional>
 #include <type_traits>
 
 #include "llvm/ADT/DenseMap.h"
@@ -77,7 +76,7 @@ struct GPUProfGlobals {
   SmallVector<__llvm_profile_data> Data;
   SmallVector<uint8_t> NamesData;
   Triple TargetTriple;
-  std::optional<uint64_t> Version;
+  uint64_t Version = INSTR_PROF_RAW_VERSION;
 
   void dump() const;
   Error write() const;
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 89f59f56f09e8..35a70d8eff901 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -282,8 +282,6 @@ Error GPUProfGlobals::write() const {
          CountsSize = Counts.size() * sizeof(int64_t);
   __llvm_profile_data *DataBegin, *DataEnd;
   char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd;
-  const uint64_t *VersionOverride =
-      Version.has_value() ? &Version.value() : nullptr;
 
   // Initialize array of contiguous data. We need to make sure each section is
   // contiguous so that the PGO library can compute deltas properly
@@ -307,7 +305,7 @@ Error GPUProfGlobals::write() const {
   // Invoke compiler-rt entrypoint
   int result = __llvm_write_custom_profile(
       TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin,
-      CountersEnd, NamesBegin, NamesEnd, VersionOverride);
+      CountersEnd, NamesBegin, NamesEnd, &Version);
   if (result != 0)
     return Plugin::error("Error writing GPU PGO data to file");
 



More information about the llvm-commits mailing list