[clang] [compiler-rt] [llvm] [openmp] [PGO][Offload] Profile profraw generation for GPU instrumentation #76587 (PR #93365)

Ethan Luis McDonough via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 25 11:47:34 PDT 2024


https://github.com/EthanLuisMcDonough updated https://github.com/llvm/llvm-project/pull/93365

>From 530eb982b9770190377bb0bd09c5cb715f34d484 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 15 Dec 2023 20:38:38 -0600
Subject: [PATCH 01/38] Add profiling functions to libomptarget

---
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  3 +++
 openmp/libomptarget/DeviceRTL/CMakeLists.txt  |  2 ++
 .../DeviceRTL/include/Profiling.h             | 21 +++++++++++++++++++
 .../libomptarget/DeviceRTL/src/Profiling.cpp  | 19 +++++++++++++++++
 4 files changed, 45 insertions(+)
 create mode 100644 openmp/libomptarget/DeviceRTL/include/Profiling.h
 create mode 100644 openmp/libomptarget/DeviceRTL/src/Profiling.cpp

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00..1d887d5cb58127 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -503,6 +503,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
+__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e40a80ab..55ee15d068c67b 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -89,6 +89,7 @@ set(include_files
   ${include_directory}/Interface.h
   ${include_directory}/LibC.h
   ${include_directory}/Mapping.h
+  ${include_directory}/Profiling.h
   ${include_directory}/State.h
   ${include_directory}/Synchronization.h
   ${include_directory}/Types.h
@@ -104,6 +105,7 @@ set(src_files
   ${source_directory}/Mapping.cpp
   ${source_directory}/Misc.cpp
   ${source_directory}/Parallelism.cpp
+  ${source_directory}/Profiling.cpp
   ${source_directory}/Reduction.cpp
   ${source_directory}/State.cpp
   ${source_directory}/Synchronization.cpp
diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h
new file mode 100644
index 00000000000000..68c7744cd60752
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h
@@ -0,0 +1,21 @@
+//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_PROFILING_H
+#define OMPTARGET_DEVICERTL_PROFILING_H
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr);
+void __llvm_profile_register_names_function(void *ptr, long int i);
+}
+
+#endif
diff --git a/openmp/libomptarget/DeviceRTL/src/Profiling.cpp b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
new file mode 100644
index 00000000000000..799477f5e47d27
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
@@ -0,0 +1,19 @@
+//===------- Profiling.cpp ---------------------------------------- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Profiling.h"
+
+#pragma omp begin declare target device_type(nohost)
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr) {}
+void __llvm_profile_register_names_function(void *ptr, long int i) {}
+}
+
+#pragma omp end declare target

>From fb067d4ffe604fd68cf90b705db1942bce49dbb1 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Sat, 16 Dec 2023 01:18:41 -0600
Subject: [PATCH 02/38] Fix PGO instrumentation for GPU targets

---
 clang/lib/CodeGen/CodeGenPGO.cpp                      | 10 ++++++++--
 .../lib/Transforms/Instrumentation/InstrProfiling.cpp | 11 ++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 81bf8ea696b164..edae6885b528ac 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -959,8 +959,14 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   unsigned Counter = (*RegionCounterMap)[S];
 
-  llvm::Value *Args[] = {FuncNameVar,
-                         Builder.getInt64(FunctionHash),
+  // Make sure that pointer to global is passed in with zero addrspace
+  // This is relevant during GPU profiling
+  auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext());
+  auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty);
+  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      FuncNameVar, I8PtrTy);
+
+  llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
                          Builder.getInt32(NumRegionCounters),
                          Builder.getInt32(Counter), StepV};
   if (!StepV)
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe5a0578bd9721..d2cb8155c17967 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1658,10 +1658,13 @@ void InstrLowerer::emitRegistration() {
   IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
   for (Value *Data : CompilerUsedVars)
     if (!isa<Function>(Data))
-      IRB.CreateCall(RuntimeRegisterF, Data);
+      // Check for addrspace cast when profiling GPU
+      IRB.CreateCall(RuntimeRegisterF,
+                     IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
   for (Value *Data : UsedVars)
     if (Data != NamesVar && !isa<Function>(Data))
-      IRB.CreateCall(RuntimeRegisterF, Data);
+      IRB.CreateCall(RuntimeRegisterF,
+                     IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
 
   if (NamesVar) {
     Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
@@ -1670,7 +1673,9 @@ void InstrLowerer::emitRegistration() {
     auto *NamesRegisterF =
         Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
                          getInstrProfNamesRegFuncName(), M);
-    IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
+    IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
+                                        NamesVar, VoidPtrTy),
+                                    IRB.getInt64(NamesSize)});
   }
 
   IRB.CreateRetVoid();

>From 7a0e0efa178cc4de6a22a8f5cc3f53cd1c81ea3a Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 21 Dec 2023 00:25:46 -0600
Subject: [PATCH 03/38] Change global visibility on GPU targets

---
 llvm/include/llvm/ProfileData/InstrProf.h       |  4 ++++
 llvm/lib/ProfileData/InstrProf.cpp              | 17 +++++++++++++++--
 .../Instrumentation/InstrProfiling.cpp          | 15 +++++++++++----
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 288dc71d756aee..bf9899d867e3dd 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
 /// Return the marker used to separate PGO names during serialization.
 inline StringRef getInstrProfNameSeparator() { return "\01"; }
 
+/// Determines whether module targets a GPU eligable for PGO
+/// instrumentation
+bool isGPUProfTarget(const Module &M);
+
 /// Return the modified name for function \c F suitable to be
 /// used the key for profile lookup. Variable \c InLTO indicates if this
 /// is called in LTO optimization passes.
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 649d814cfd9de0..0d6717aeb0142c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -410,13 +410,22 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
   return VarName;
 }
 
+bool isGPUProfTarget(const Module &M) {
+  const auto &triple = M.getTargetTriple();
+  return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 ||
+         triple.rfind("r600", 0) == 0;
+}
+
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                      GlobalValue::LinkageTypes Linkage,
                                      StringRef PGOFuncName) {
+  // Ensure profiling variables on GPU are visible to be read from host
+  if (isGPUProfTarget(M))
+    Linkage = GlobalValue::ExternalLinkage;
   // We generally want to match the function's linkage, but available_externally
   // and extern_weak both have the wrong semantics, and anything that doesn't
   // need to link across compilation units doesn't need to be visible at all.
-  if (Linkage == GlobalValue::ExternalWeakLinkage)
+  else if (Linkage == GlobalValue::ExternalWeakLinkage)
     Linkage = GlobalValue::LinkOnceAnyLinkage;
   else if (Linkage == GlobalValue::AvailableExternallyLinkage)
     Linkage = GlobalValue::LinkOnceODRLinkage;
@@ -430,8 +439,12 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
       new GlobalVariable(M, Value->getType(), true, Linkage, Value,
                          getPGOFuncNameVarName(PGOFuncName, Linkage));
 
+  // If the target is a GPU, make the symbol protected so it can
+  // be read from the host device
+  if (isGPUProfTarget(M))
+    FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
   // Hide the symbol so that we correctly get a copy for each executable.
-  if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
+  else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
     FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
 
   return FuncNameVar;
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index d2cb8155c17967..3b582b65190808 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1481,6 +1481,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
+  if (isGPUProfTarget(M)) {
+    Linkage = GlobalValue::ExternalLinkage;
+    Visibility = GlobalValue::ProtectedVisibility;
+  }
   // If the data variable is not referenced by code (if we don't emit
   // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
   // data variable live under linker GC, the data variable can be private. This
@@ -1492,9 +1496,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
   // that other copies must have the same CFG and cannot have value profiling.
   // If no hash suffix, other profd copies may be referenced by code.
-  if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
-      (TT.isOSBinFormatELF() ||
-       (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
+  else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+           (TT.isOSBinFormatELF() ||
+            (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
     Linkage = GlobalValue::PrivateLinkage;
     Visibility = GlobalValue::DefaultVisibility;
   }
@@ -1696,7 +1700,10 @@ bool InstrLowerer::emitRuntimeHook() {
   auto *Var =
       new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
                          nullptr, getInstrProfRuntimeHookVarName());
-  Var->setVisibility(GlobalValue::HiddenVisibility);
+  if (isGPUProfTarget(M))
+    Var->setVisibility(GlobalValue::ProtectedVisibility);
+  else
+    Var->setVisibility(GlobalValue::HiddenVisibility);
 
   if (TT.isOSBinFormatELF() && !TT.isPS()) {
     // Mark the user variable as used so that it isn't stripped out.

>From fddc07908ed9aa698fe3250ddbfc5621ab4d049d Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 22 Dec 2023 23:43:29 -0600
Subject: [PATCH 04/38] Make names global public on GPU

---
 llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 3b582b65190808..61fba7be3ee0ee 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1621,6 +1621,13 @@ void InstrLowerer::emitNameData() {
   NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
+
+  // Make names variable public if current target is a GPU
+  if (isGPUProfTarget(M)) {
+    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+    NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
+  }
+
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
   NamesVar->setSection(

>From e9db03c70bf79f4f4ddad4b48a5aa63a37e0d4f6 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 29 Dec 2023 12:54:50 -0600
Subject: [PATCH 05/38] Read and print GPU device PGO globals

---
 .../common/include/GlobalHandler.h            | 27 ++++++
 .../common/src/GlobalHandler.cpp              | 82 +++++++++++++++++++
 .../common/src/PluginInterface.cpp            | 14 ++++
 3 files changed, 123 insertions(+)

diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index fa079ac9660ee0..a82cd536487653 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -14,9 +14,11 @@
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 
 #include <string>
+#include <vector>
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ProfileData/InstrProf.h"
 
 #include "Shared/Debug.h"
 #include "Shared/Utils.h"
@@ -58,6 +60,22 @@ class GlobalTy {
   void setPtr(void *P) { Ptr = P; }
 };
 
+typedef void *IntPtrT;
+struct __llvm_profile_data {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+/// PGO profiling data extracted from a GPU device
+struct GPUProfGlobals {
+  std::string names;
+  std::vector<std::vector<int64_t>> counts;
+  std::vector<__llvm_profile_data> data;
+  Triple targetTriple;
+
+  void dump() const;
+};
+
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
 template <typename Ty> class StaticGlobalTy : public GlobalTy {
   Ty Data;
@@ -172,6 +190,15 @@ class GenericGlobalHandlerTy {
     return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
                                           /* D2H */ false);
   }
+
+  /// Checks whether a given image contains profiling globals.
+  bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
+
+  /// Reads profiling data from a GPU image to supplied profdata struct.
+  /// Iterates through the image symbol table and stores global values
+  /// with profiling prefixes.
+  Expected<GPUProfGlobals> readProfilingGlobals(GenericDeviceTy &Device,
+                                                DeviceImageTy &Image);
 };
 
 } // namespace plugin
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 3a272e228c7dfe..5dd5daec468ca5 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -176,3 +176,85 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
 
   return Plugin::success();
 }
+
+bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
+                                                 DeviceImageTy &Image) {
+  GlobalTy global(getInstrProfNamesVarName().str(), 0);
+  if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) {
+    consumeError(std::move(Err));
+    return false;
+  }
+  return true;
+}
+
+Expected<GPUProfGlobals>
+GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
+                                             DeviceImageTy &Image) {
+  GPUProfGlobals profdata;
+  const auto *elf = getOrCreateELFObjectFile(Device, Image);
+  profdata.targetTriple = elf->makeTriple();
+  // Iterate through
+  for (auto &sym : elf->symbols()) {
+    if (auto name = sym.getName()) {
+      // Check if given current global is a profiling global based
+      // on name
+      if (name->equals(getInstrProfNamesVarName())) {
+        // Read in profiled function names
+        std::vector<char> chars(sym.getSize() / sizeof(char), ' ');
+        GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data());
+        if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
+          return Err;
+        std::string names(chars.begin(), chars.end());
+        profdata.names = std::move(names);
+      } else if (name->starts_with(getInstrProfCountersVarPrefix())) {
+        // Read global variable profiling counts
+        std::vector<int64_t> counts(sym.getSize() / sizeof(int64_t), 0);
+        GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data());
+        if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
+          return Err;
+        profdata.counts.push_back(std::move(counts));
+      } else if (name->starts_with(getInstrProfDataVarPrefix())) {
+        // Read profiling data for this global variable
+        __llvm_profile_data data{};
+        GlobalTy DataGlobal(name->str(), sym.getSize(), &data);
+        if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
+          return Err;
+        profdata.data.push_back(std::move(data));
+      }
+    }
+  }
+  return profdata;
+}
+
+void GPUProfGlobals::dump() const {
+  llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str()
+               << "\n";
+
+  llvm::outs() << "======== Counters =========\n";
+  for (const auto &count : counts) {
+    llvm::outs() << "[";
+    for (size_t i = 0; i < count.size(); i++) {
+      if (i == 0)
+        llvm::outs() << " ";
+      llvm::outs() << count[i] << " ";
+    }
+    llvm::outs() << "]\n";
+  }
+
+  llvm::outs() << "========== Data ===========\n";
+  for (const auto &d : data) {
+    llvm::outs() << "{ ";
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
+  llvm::outs() << d.Name << " ";
+#include "llvm/ProfileData/InstrProfData.inc"
+    llvm::outs() << " }\n";
+  }
+
+  llvm::outs() << "======== Functions ========\n";
+  InstrProfSymtab symtab;
+  if (Error Err = symtab.create(StringRef(names))) {
+    consumeError(std::move(Err));
+  }
+  symtab.dumpNames(llvm::outs());
+  llvm::outs() << "===========================\n";
+}
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 3c7d1ca8998787..84ed90f03f84f1 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -811,6 +811,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
            DeviceMemoryPoolTracking.AllocationMax);
   }
 
+  for (auto *Image : LoadedImages) {
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    if (!Handler.hasProfilingGlobals(*this, *Image))
+      continue;
+
+    GPUProfGlobals profdata;
+    auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
+    if (!ProfOrErr)
+      return ProfOrErr.takeError();
+
+    // TODO: write data to profiling file
+    ProfOrErr->dump();
+  }
+
   // Delete the memory manager before deinitializing the device. Otherwise,
   // we may delete device allocations after the device is deinitialized.
   if (MemoryManager)

>From e4687605d1a6ca932312025826db09dba84845a3 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:06:15 -0600
Subject: [PATCH 06/38] Fix rebase bug

---
 .../plugins-nextgen/common/src/GlobalHandler.cpp       | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index cb71b61f4a9c4f..86742d0f77a2fe 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -178,10 +178,12 @@ Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
   GPUProfGlobals profdata;
-  const auto *elf = getOrCreateELFObjectFile(Device, Image);
-  profdata.targetTriple = elf->makeTriple();
-  // Iterate through
-  for (auto &sym : elf->symbols()) {
+  auto ELFObj = getELFObjectFile(Image);
+  if (!ELFObj)
+    return ELFObj.takeError();
+  profdata.targetTriple = ELFObj->makeTriple();
+  // Iterate through elf symbols
+  for (auto &sym : ELFObj->symbols()) {
     if (auto name = sym.getName()) {
       // Check if given current global is a profiling global based
       // on name

>From ec18ce94c227e1d43927955fa1c67360ecfcfca6 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:10:19 -0600
Subject: [PATCH 07/38] Refactor portions to be more idiomatic

---
 clang/lib/CodeGen/CodeGenPGO.cpp   | 4 +---
 llvm/lib/ProfileData/InstrProf.cpp | 5 ++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index edae6885b528ac..7bfcec43ee4c98 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -961,10 +961,8 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext());
-  auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty);
   auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      FuncNameVar, I8PtrTy);
+      FuncNameVar, llvm::PointerType::getUnqual(CGM.getLLVMContext()));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
                          Builder.getInt32(NumRegionCounters),
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index cdcd6840bb5108..1d88da16a5ff9c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -429,9 +429,8 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
 }
 
 bool isGPUProfTarget(const Module &M) {
-  const auto &triple = M.getTargetTriple();
-  return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 ||
-         triple.rfind("r600", 0) == 0;
+  const auto &Triple = llvm::Triple(M.getTargetTriple());
+  return Triple.isAMDGPU() || Triple.isNVPTX();
 }
 
 GlobalVariable *createPGOFuncNameVar(Module &M,

>From 0872556f597056361b0a2c23cdd0be3d9745aef3 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:18:47 -0600
Subject: [PATCH 08/38] Reformat DeviceRTL prof functions

---
 openmp/libomptarget/DeviceRTL/include/Profiling.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h
index 68c7744cd60752..9efc1554c176bc 100644
--- a/openmp/libomptarget/DeviceRTL/include/Profiling.h
+++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h
@@ -13,9 +13,8 @@
 #define OMPTARGET_DEVICERTL_PROFILING_H
 
 extern "C" {
-
-void __llvm_profile_register_function(void *ptr);
-void __llvm_profile_register_names_function(void *ptr, long int i);
+void __llvm_profile_register_function(void *Ptr);
+void __llvm_profile_register_names_function(void *Ptr, long int I);
 }
 
 #endif

>From 62f31d1c71b5d100f38d6dc584cc138b3904581b Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 9 Jan 2024 11:52:29 -0600
Subject: [PATCH 09/38] Style changes + catch name error

---
 .../common/include/GlobalHandler.h            | 16 ++--
 .../common/src/GlobalHandler.cpp              | 87 ++++++++++---------
 2 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index a803b3f76d8b25..755bb23a414e37 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -13,8 +13,7 @@
 #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 
-#include <string>
-#include <vector>
+#include <type_traits>
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Object/ELFObjectFile.h"
@@ -60,18 +59,19 @@ class GlobalTy {
   void setPtr(void *P) { Ptr = P; }
 };
 
-typedef void *IntPtrT;
+using IntPtrT = void *;
 struct __llvm_profile_data {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
+  std::remove_const<Type>::type Name;
 #include "llvm/ProfileData/InstrProfData.inc"
 };
 
 /// PGO profiling data extracted from a GPU device
 struct GPUProfGlobals {
-  std::string names;
-  std::vector<std::vector<int64_t>> counts;
-  std::vector<__llvm_profile_data> data;
-  Triple targetTriple;
+  SmallVector<uint8_t> NamesData;
+  SmallVector<SmallVector<int64_t>> Counts;
+  SmallVector<__llvm_profile_data> Data;
+  Triple TargetTriple;
 
   void dump() const;
 };
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 86742d0f77a2fe..7cb672e7b26839 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/Error.h"
 
 #include <cstring>
+#include <string>
 
 using namespace llvm;
 using namespace omp;
@@ -177,73 +178,81 @@ bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
 Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
-  GPUProfGlobals profdata;
+  GPUProfGlobals DeviceProfileData;
   auto ELFObj = getELFObjectFile(Image);
   if (!ELFObj)
     return ELFObj.takeError();
-  profdata.targetTriple = ELFObj->makeTriple();
+  DeviceProfileData.TargetTriple = ELFObj->makeTriple();
+
   // Iterate through elf symbols
-  for (auto &sym : ELFObj->symbols()) {
-    if (auto name = sym.getName()) {
-      // Check if given current global is a profiling global based
-      // on name
-      if (name->equals(getInstrProfNamesVarName())) {
-        // Read in profiled function names
-        std::vector<char> chars(sym.getSize() / sizeof(char), ' ');
-        GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data());
-        if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
-          return Err;
-        std::string names(chars.begin(), chars.end());
-        profdata.names = std::move(names);
-      } else if (name->starts_with(getInstrProfCountersVarPrefix())) {
-        // Read global variable profiling counts
-        std::vector<int64_t> counts(sym.getSize() / sizeof(int64_t), 0);
-        GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data());
-        if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
-          return Err;
-        profdata.counts.push_back(std::move(counts));
-      } else if (name->starts_with(getInstrProfDataVarPrefix())) {
-        // Read profiling data for this global variable
-        __llvm_profile_data data{};
-        GlobalTy DataGlobal(name->str(), sym.getSize(), &data);
-        if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
-          return Err;
-        profdata.data.push_back(std::move(data));
-      }
+  for (auto &Sym : ELFObj->symbols()) {
+    auto NameOrErr = Sym.getName();
+    if (!NameOrErr)
+      return ELFObj.takeError();
+
+    // Check if given current global is a profiling global based
+    // on name
+    if (NameOrErr->equals(getInstrProfNamesVarName())) {
+      // Read in profiled function names
+      DeviceProfileData.NamesData = SmallVector<uint8_t>(Sym.getSize(), 0);
+      GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(),
+                           DeviceProfileData.NamesData.data());
+      if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
+        return Err;
+    } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) {
+      // Read global variable profiling counts
+      SmallVector<int64_t> Counts(Sym.getSize() / sizeof(int64_t), 0);
+      GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
+      if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
+        return Err;
+      DeviceProfileData.Counts.push_back(std::move(Counts));
+    } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
+      // Read profiling data for this global variable
+      __llvm_profile_data Data{};
+      GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data);
+      if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
+        return Err;
+      DeviceProfileData.Data.push_back(std::move(Data));
     }
   }
-  return profdata;
+  return DeviceProfileData;
 }
 
 void GPUProfGlobals::dump() const {
-  llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str()
+  llvm::outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str()
                << "\n";
 
   llvm::outs() << "======== Counters =========\n";
-  for (const auto &count : counts) {
+  for (const auto &Count : Counts) {
     llvm::outs() << "[";
-    for (size_t i = 0; i < count.size(); i++) {
+    for (size_t i = 0; i < Count.size(); i++) {
       if (i == 0)
         llvm::outs() << " ";
-      llvm::outs() << count[i] << " ";
+      llvm::outs() << Count[i] << " ";
     }
     llvm::outs() << "]\n";
   }
 
   llvm::outs() << "========== Data ===========\n";
-  for (const auto &d : data) {
+  for (const auto &ProfData : Data) {
     llvm::outs() << "{ ";
 #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
-  llvm::outs() << d.Name << " ";
+  llvm::outs() << ProfData.Name << " ";
 #include "llvm/ProfileData/InstrProfData.inc"
     llvm::outs() << " }\n";
   }
 
   llvm::outs() << "======== Functions ========\n";
-  InstrProfSymtab symtab;
-  if (Error Err = symtab.create(StringRef(names))) {
+  std::string s;
+  s.reserve(NamesData.size());
+  for (uint8_t Name : NamesData) {
+    s.push_back((char)Name);
+  }
+
+  InstrProfSymtab Symtab;
+  if (Error Err = Symtab.create(StringRef(s))) {
     consumeError(std::move(Err));
   }
-  symtab.dumpNames(llvm::outs());
+  Symtab.dumpNames(llvm::outs());
   llvm::outs() << "===========================\n";
 }

>From 0c4bbeb54d189c1461affd37853aa86c3e3ca7d8 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 17 Jan 2024 19:59:06 -0600
Subject: [PATCH 10/38] Add GPU PGO test

---
 .../common/src/GlobalHandler.cpp              |  2 +-
 openmp/libomptarget/test/CMakeLists.txt       |  6 +++
 openmp/libomptarget/test/lit.cfg              |  3 ++
 openmp/libomptarget/test/lit.site.cfg.in      |  2 +-
 openmp/libomptarget/test/offloading/pgo1.c    | 39 +++++++++++++++++++
 5 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/pgo1.c

diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 7cb672e7b26839..e5eb653d022287 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -239,7 +239,7 @@ void GPUProfGlobals::dump() const {
 #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
   llvm::outs() << ProfData.Name << " ";
 #include "llvm/ProfileData/InstrProfData.inc"
-    llvm::outs() << " }\n";
+    llvm::outs() << "}\n";
   }
 
   llvm::outs() << "======== Functions ========\n";
diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt
index a0ba233eaa5726..21233f3e252eb5 100644
--- a/openmp/libomptarget/test/CMakeLists.txt
+++ b/openmp/libomptarget/test/CMakeLists.txt
@@ -12,6 +12,12 @@ else()
   set(LIBOMPTARGET_DEBUG False)
 endif()
 
+if (OPENMP_STANDALONE_BUILD)
+  set(LIBOMPTARGET_TEST_GPU_PGO False)
+else()
+  set(LIBOMPTARGET_TEST_GPU_PGO True)
+endif()
+
 # Replace the space from user's input with ";" in case that CMake add escape
 # char into the lit command.
 string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 19c5e5c4572227..49743f9fed7f29 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -104,6 +104,9 @@ config.available_features.add(config.libomptarget_current_target)
 if config.libomptarget_has_libc:
   config.available_features.add('libc')
 
+if config.libomptarget_test_pgo:
+  config.available_features.add('pgo')
+
 # Determine whether the test system supports unified memory.
 # For CUDA, this is the case with compute capability 70 (Volta) or higher.
 # For all other targets, we currently assume it is.
diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in
index 2d638118838727..494d1636af304a 100644
--- a/openmp/libomptarget/test/lit.site.cfg.in
+++ b/openmp/libomptarget/test/lit.site.cfg.in
@@ -25,6 +25,6 @@ config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
-
+config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@
 # Let the main config do the real work.
 lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c
new file mode 100644
index 00000000000000..ca8a6f502a06aa
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/pgo1.c
@@ -0,0 +1,39 @@
+// RUN: %libomptarget-compile-generic -fprofile-instr-generate -Xclang "-fprofile-instrument=clang"
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// REQUIRES: pgo
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+int test1(int a) { return a / 2; }
+int test2(int a) { return a * 2; }
+
+int main() {
+  int m = 2;
+#pragma omp target
+  for (int i = 0; i < 10; i++) {
+    m = test1(m);
+    for (int j = 0; j < 2; j++) {
+      m = test2(m);
+    }
+  }
+}
+
+// CHECK: ======== Counters =========
+// CHECK-NEXT: [ 0 11 20 ]
+// CHECK-NEXT: [ 10 ]
+// CHECK-NEXT: [ 20 ]
+// CHECK-NEXT: ========== Data ===========
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: ======== Functions ========
+// CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
+// CHECK-NEXT: test1
+// CHECK-NEXT: test2

>From c7ae2a74daa93b05058fcc9bba64e0734359362c Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 17 Jan 2024 23:12:27 -0600
Subject: [PATCH 11/38] Fix PGO test formatting

---
 openmp/libomptarget/test/offloading/pgo1.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c
index ca8a6f502a06aa..389be19b670d76 100644
--- a/openmp/libomptarget/test/offloading/pgo1.c
+++ b/openmp/libomptarget/test/offloading/pgo1.c
@@ -1,4 +1,5 @@
-// RUN: %libomptarget-compile-generic -fprofile-instr-generate -Xclang "-fprofile-instrument=clang"
+// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
+// RUN:     -Xclang "-fprofile-instrument=clang"
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 
 // UNSUPPORTED: x86_64-pc-linux-gnu
@@ -30,9 +31,18 @@ int main() {
 // CHECK-NEXT: [ 10 ]
 // CHECK-NEXT: [ 20 ]
 // CHECK-NEXT: ========== Data ===========
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
 // CHECK-NEXT: ======== Functions ========
 // CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
 // CHECK-NEXT: test1

>From 8bb22072914bbb830e2788d117aedd0e0bab66ff Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 18 Jan 2024 23:15:55 -0600
Subject: [PATCH 12/38] Refactor visibility logic

---
 llvm/lib/ProfileData/InstrProf.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 511571a3eed9b0..708ea63fd95e04 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -422,6 +422,16 @@ bool isGPUProfTarget(const Module &M) {
   return Triple.isAMDGPU() || Triple.isNVPTX();
 }
 
+void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
+  // If the target is a GPU, make the symbol protected so it can
+  // be read from the host device
+  if (isGPUProfTarget(M))
+    FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
+  // Hide the symbol so that we correctly get a copy for each executable.
+  else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
+    FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
+}
+
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                      GlobalValue::LinkageTypes Linkage,
                                      StringRef PGOFuncName) {
@@ -445,14 +455,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
       new GlobalVariable(M, Value->getType(), true, Linkage, Value,
                          getPGOFuncNameVarName(PGOFuncName, Linkage));
 
-  // If the target is a GPU, make the symbol protected so it can
-  // be read from the host device
-  if (isGPUProfTarget(M))
-    FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
-  // Hide the symbol so that we correctly get a copy for each executable.
-  else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
-    FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
-
+  setPGOFuncVisibility(M, FuncNameVar);
   return FuncNameVar;
 }
 

>From 9f13943f64cb16162e44902d54de53a9b1229179 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 23 Jan 2024 18:33:58 -0600
Subject: [PATCH 13/38] Add LLVM instrumentation support

This PR formerly only supported -fprofile-instrument=clang. This commit adds support for -fprofile-instrument=llvm
---
 .../Instrumentation/PGOInstrumentation.cpp    | 12 +++-
 openmp/libomptarget/test/offloading/pgo1.c    | 72 +++++++++++++------
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c20fc942eaf0d5..bbc8da78fd7baf 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -862,6 +862,10 @@ static void instrumentOneFunc(
   auto Name = FuncInfo.FuncNameVar;
   auto CFGHash = ConstantInt::get(Type::getInt64Ty(M->getContext()),
                                   FuncInfo.FunctionHash);
+  // Make sure that pointer to global is passed in with zero addrspace
+  // This is relevant during GPU profiling
+  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      Name, llvm::PointerType::getUnqual(M->getContext()));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();
     IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
@@ -869,7 +873,7 @@ static void instrumentOneFunc(
     //                      i32 <index>)
     Builder.CreateCall(
         Intrinsic::getDeclaration(M, Intrinsic::instrprof_cover),
-        {Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
+        {NormalizedPtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
     return;
   }
 
@@ -887,7 +891,8 @@ static void instrumentOneFunc(
     //                          i32 <index>)
     Builder.CreateCall(
         Intrinsic::getDeclaration(M, Intrinsic::instrprof_timestamp),
-        {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)});
+        {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters),
+         Builder.getInt32(I)});
     I += PGOBlockCoverage ? 8 : 1;
   }
 
@@ -901,7 +906,8 @@ static void instrumentOneFunc(
         Intrinsic::getDeclaration(M, PGOBlockCoverage
                                          ? Intrinsic::instrprof_cover
                                          : Intrinsic::instrprof_increment),
-        {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)});
+        {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters),
+         Builder.getInt32(I++)});
   }
 
   // Now instrument select instructions:
diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c
index 389be19b670d76..d95793b508dcfc 100644
--- a/openmp/libomptarget/test/offloading/pgo1.c
+++ b/openmp/libomptarget/test/offloading/pgo1.c
@@ -1,6 +1,11 @@
 // RUN: %libomptarget-compile-generic -fprofile-instr-generate \
 // RUN:     -Xclang "-fprofile-instrument=clang"
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
+// RUN:     --check-prefix="CLANG-PGO"
+// RUN: %libomptarget-compile-generic -fprofile-generate \
+// RUN:     -Xclang "-fprofile-instrument=llvm"
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
+// RUN:     --check-prefix="LLVM-PGO"
 
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
@@ -26,24 +31,47 @@ int main() {
   }
 }
 
-// CHECK: ======== Counters =========
-// CHECK-NEXT: [ 0 11 20 ]
-// CHECK-NEXT: [ 10 ]
-// CHECK-NEXT: [ 20 ]
-// CHECK-NEXT: ========== Data ===========
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CHECK-NEXT: ======== Functions ========
-// CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
-// CHECK-NEXT: test1
-// CHECK-NEXT: test2
+// CLANG-PGO: ======== Counters =========
+// CLANG-PGO-NEXT: [ 0 11 20 ]
+// CLANG-PGO-NEXT: [ 10 ]
+// CLANG-PGO-NEXT: [ 20 ]
+// CLANG-PGO-NEXT: ========== Data ===========
+// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// CLANG-PGO-NEXT: ======== Functions ========
+// CLANG-PGO-NEXT: pgo1.c:
+// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
+// CLANG-PGO-NEXT: test1
+// CLANG-PGO-NEXT: test2
+
+// LLVM-PGO: ======== Counters =========
+// LLVM-PGO-NEXT: [ 20 ]
+// LLVM-PGO-NEXT: [ 10 ]
+// LLVM-PGO-NEXT: [ 20 10 1 1 ]
+// LLVM-PGO-NEXT: ========== Data ===========
+// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
+// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
+// LLVM-PGO-NEXT: ======== Functions ========
+// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
+// LLVM-PGO-NEXT: test1
+// LLVM-PGO-NEXT: test2

>From 0606f0dd1b32ef9ebe138bbc964b3921e22d95d1 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 14 Feb 2024 01:46:55 -0600
Subject: [PATCH 14/38] Use explicit addrspace instead of unqual

---
 clang/lib/CodeGen/CodeGenPGO.cpp                           | 2 +-
 llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index e084dda879cbc0..4c75a01222d304 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1103,7 +1103,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
   auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      FuncNameVar, llvm::PointerType::getUnqual(CGM.getLLVMContext()));
+      FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
                          Builder.getInt32(NumRegionCounters),
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index bbc8da78fd7baf..c63b3e4ecf786a 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -865,7 +865,7 @@ static void instrumentOneFunc(
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
   auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      Name, llvm::PointerType::getUnqual(M->getContext()));
+      Name, llvm::PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();
     IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());

>From c1f9be321678766525141214aaab74636cafbc2c Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 15 Feb 2024 19:10:09 -0600
Subject: [PATCH 15/38] Remove redundant namespaces

---
 .../Instrumentation/PGOInstrumentation.cpp    |  4 +--
 .../common/src/GlobalHandler.cpp              | 26 +++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c63b3e4ecf786a..3058e577738fda 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -864,8 +864,8 @@ static void instrumentOneFunc(
                                   FuncInfo.FunctionHash);
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      Name, llvm::PointerType::get(M->getContext(), 0));
+  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      Name, PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();
     IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index e5eb653d022287..ae270c60804d26 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -219,30 +219,30 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
 }
 
 void GPUProfGlobals::dump() const {
-  llvm::outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str()
+  outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str()
                << "\n";
 
-  llvm::outs() << "======== Counters =========\n";
+  outs() << "======== Counters =========\n";
   for (const auto &Count : Counts) {
-    llvm::outs() << "[";
+    outs() << "[";
     for (size_t i = 0; i < Count.size(); i++) {
       if (i == 0)
-        llvm::outs() << " ";
-      llvm::outs() << Count[i] << " ";
+        outs() << " ";
+      outs() << Count[i] << " ";
     }
-    llvm::outs() << "]\n";
+    outs() << "]\n";
   }
 
-  llvm::outs() << "========== Data ===========\n";
+  outs() << "========== Data ===========\n";
   for (const auto &ProfData : Data) {
-    llvm::outs() << "{ ";
+    outs() << "{ ";
 #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
-  llvm::outs() << ProfData.Name << " ";
+  outs() << ProfData.Name << " ";
 #include "llvm/ProfileData/InstrProfData.inc"
-    llvm::outs() << "}\n";
+    outs() << "}\n";
   }
 
-  llvm::outs() << "======== Functions ========\n";
+  outs() << "======== Functions ========\n";
   std::string s;
   s.reserve(NamesData.size());
   for (uint8_t Name : NamesData) {
@@ -253,6 +253,6 @@ void GPUProfGlobals::dump() const {
   if (Error Err = Symtab.create(StringRef(s))) {
     consumeError(std::move(Err));
   }
-  Symtab.dumpNames(llvm::outs());
-  llvm::outs() << "===========================\n";
+  Symtab.dumpNames(outs());
+  outs() << "===========================\n";
 }

>From 6a3ae407e69e7524f0f808329c534f8352ee1779 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 15 Feb 2024 19:15:15 -0600
Subject: [PATCH 16/38] Clang format

---
 .../libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index ae270c60804d26..1fce2448922624 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -220,7 +220,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
 
 void GPUProfGlobals::dump() const {
   outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str()
-               << "\n";
+         << "\n";
 
   outs() << "======== Counters =========\n";
   for (const auto &Count : Counts) {

>From 6866862d459e3c3fa65fae8ae639ddc3ff735252 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 16 Feb 2024 13:13:39 -0600
Subject: [PATCH 17/38] Use getAddrSpaceCast

Replace getPointerBitCastOrAddrSpaceCast with getAddrSpaceCast and allow no-op getAddrSpaceCast calls when types are identical
---
 clang/lib/CodeGen/CodeGenPGO.cpp                           | 2 +-
 llvm/lib/IR/Constants.cpp                                  | 4 ++++
 llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 8f52018445d2b0..baceeba8380ddb 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+  auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast(
       FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a38b912164b130..2d89c5bbd4a4c2 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -2067,6 +2067,10 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy,
 
 Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
                                          bool OnlyIfReduced) {
+  // Skip cast if types are identical
+  if (C->getType() == DstTy)
+    return C;
+
   assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
          "Invalid constantexpr addrspacecast!");
   return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 3058e577738fda..c0be71aa4cc004 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -864,7 +864,7 @@ static void instrumentOneFunc(
                                   FuncInfo.FunctionHash);
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+  auto *NormalizedPtr = ConstantExpr::getAddrSpaceCast(
       Name, PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();

>From 62a5ee1c75545571f81d9edd22e19e9ef7cff69f Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 27 Feb 2024 14:53:51 -0600
Subject: [PATCH 18/38] Revert "Use getAddrSpaceCast"

This reverts commit 6866862d459e3c3fa65fae8ae639ddc3ff735252.
---
 clang/lib/CodeGen/CodeGenPGO.cpp                           | 2 +-
 llvm/lib/IR/Constants.cpp                                  | 4 ----
 llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +-
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index baceeba8380ddb..8f52018445d2b0 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast(
+  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
       FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 2d89c5bbd4a4c2..a38b912164b130 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -2067,10 +2067,6 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy,
 
 Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
                                          bool OnlyIfReduced) {
-  // Skip cast if types are identical
-  if (C->getType() == DstTy)
-    return C;
-
   assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
          "Invalid constantexpr addrspacecast!");
   return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c0be71aa4cc004..3058e577738fda 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -864,7 +864,7 @@ static void instrumentOneFunc(
                                   FuncInfo.FunctionHash);
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = ConstantExpr::getAddrSpaceCast(
+  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
       Name, PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();

>From 052394fa28c923d130bf73a07b965a9751467302 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 27 Feb 2024 15:34:34 -0600
Subject: [PATCH 19/38] Revert "Use getAddrSpaceCast"

This reverts commit 6866862d459e3c3fa65fae8ae639ddc3ff735252.
---
 clang/lib/CodeGen/CodeGenPGO.cpp                           | 2 +-
 llvm/lib/IR/Constants.cpp                                  | 4 ----
 llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +-
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index baceeba8380ddb..8f52018445d2b0 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast(
+  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
       FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 2d89c5bbd4a4c2..a38b912164b130 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -2067,10 +2067,6 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy,
 
 Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
                                          bool OnlyIfReduced) {
-  // Skip cast if types are identical
-  if (C->getType() == DstTy)
-    return C;
-
   assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
          "Invalid constantexpr addrspacecast!");
   return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c0be71aa4cc004..3058e577738fda 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -864,7 +864,7 @@ static void instrumentOneFunc(
                                   FuncInfo.FunctionHash);
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = ConstantExpr::getAddrSpaceCast(
+  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
       Name, PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();

>From 612d5a5f6966a77e82e5591f5aea475fbf886e55 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 1 Mar 2024 02:04:00 -0600
Subject: [PATCH 20/38] Write PGO

TODO: Fix tests
---
 compiler-rt/lib/profile/InstrProfiling.h      |  11 ++
 compiler-rt/lib/profile/InstrProfilingFile.c  | 148 +++++++++++++++---
 .../common/include/GlobalHandler.h            |  14 +-
 .../common/src/GlobalHandler.cpp              |  57 +++++--
 .../common/src/PluginInterface.cpp            |   6 +-
 5 files changed, 200 insertions(+), 36 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 01239083369187..937acbd417de46 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -275,6 +275,17 @@ void __llvm_profile_get_padding_sizes_for_counters(
  */
 void __llvm_profile_set_dumped();
 
+/*!
+ * \brief Write custom target-specific profiling data to a seperate file.
+ * Used by libomptarget for GPU PGO.
+ */
+int __llvm_write_custom_profile(const char *Target,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd, const char *NamesBegin,
+                                const char *NamesEnd);
+
 /*!
  * This variable is defined in InstrProfilingRuntime.cpp as a hidden
  * symbol. Its main purpose is to enable profile runtime user to
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index f3b457d786e6bd..4fc401bb9bebf5 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -502,27 +502,15 @@ static FILE *getFileObject(const char *OutputName) {
   return fopen(OutputName, "ab");
 }
 
-/* Write profile data to file \c OutputName.  */
-static int writeFile(const char *OutputName) {
-  int RetVal;
-  FILE *OutputFile;
-
-  int MergeDone = 0;
+/* Get file object and merge if applicable */
+static FILE *getMergeFileObject(const char *OutputName, int *MergeDone) {
   VPMergeHook = &lprofMergeValueProfData;
   if (doMerging())
-    OutputFile = openFileForMerging(OutputName, &MergeDone);
-  else
-    OutputFile = getFileObject(OutputName);
-
-  if (!OutputFile)
-    return -1;
-
-  FreeHook = &free;
-  setupIOBuffer();
-  ProfDataWriter fileWriter;
-  initFileWriter(&fileWriter, OutputFile);
-  RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
+    return openFileForMerging(OutputName, MergeDone);
+  return getFileObject(OutputName);
+}
 
+static void closeFileObject(FILE *OutputFile) {
   if (OutputFile == getProfileFile()) {
     fflush(OutputFile);
     if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
@@ -531,7 +519,23 @@ static int writeFile(const char *OutputName) {
   } else {
     fclose(OutputFile);
   }
+}
+
+/* Write profile data to file \c OutputName.  */
+static int writeFile(const char *OutputName) {
+  int RetVal, MergeDone = 0;
+  FILE *OutputFile = getMergeFileObject(OutputName, &MergeDone);
+
+  if (!OutputFile)
+    return -1;
+
+  FreeHook = &free;
+  setupIOBuffer();
+  ProfDataWriter fileWriter;
+  initFileWriter(&fileWriter, OutputFile);
+  RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
 
+  closeFileObject(OutputFile);
   return RetVal;
 }
 
@@ -558,10 +562,16 @@ static int writeOrderFile(const char *OutputName) {
 
 #define LPROF_INIT_ONCE_ENV "__LLVM_PROFILE_RT_INIT_ONCE"
 
+static void forceTruncateFile(const char *Filename) {
+  FILE *File = fopen(Filename, "w");
+  if (!File)
+    return;
+  fclose(File);
+}
+
 static void truncateCurrentFile(void) {
   const char *Filename;
   char *FilenameBuf;
-  FILE *File;
   int Length;
 
   Length = getCurFilenameLength();
@@ -591,10 +601,7 @@ static void truncateCurrentFile(void) {
     return;
 
   /* Truncate the file.  Later we'll reopen and append. */
-  File = fopen(Filename, "w");
-  if (!File)
-    return;
-  fclose(File);
+  forceTruncateFile(Filename);
 }
 
 /* Write a partial profile to \p Filename, which is required to be backed by
@@ -1271,4 +1278,99 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
   return 0;
 }
 
+int __llvm_write_custom_profile(const char *Target,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd, const char *NamesBegin,
+                                const char *NamesEnd) {
+  int ReturnValue = 0, FilenameLength, TargetLength, MergeDone;
+  char *FilenameBuf, *TargetFilename;
+  const char *Filename;
+
+  /* Save old profile data */
+  FILE *oldFile = getProfileFile();
+
+  // Temporarily suspend getting SIGKILL when the parent exits.
+  int PDeathSig = lprofSuspendSigKill();
+
+  if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
+    PROF_NOTE("Profile data not written to file: %s.\n", "already written");
+    if (PDeathSig == 1)
+      lprofRestoreSigKill();
+    return 0;
+  }
+
+  /* Get current filename */
+  FilenameLength = getCurFilenameLength();
+  FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1);
+  Filename = getCurFilename(FilenameBuf, 0);
+
+  /* Check the filename. */
+  if (!Filename) {
+    PROF_ERR("Failed to write file : %s\n", "Filename not set");
+    if (PDeathSig == 1)
+      lprofRestoreSigKill();
+    return -1;
+  }
+
+  /* Allocate new space for our target-specific PGO filename */
+  TargetLength = strlen(Target);
+  TargetFilename =
+      (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
+
+  /* Prepend "TARGET." to current filename */
+  memcpy(TargetFilename, Target, TargetLength);
+  TargetFilename[TargetLength] = '.';
+  memcpy(TargetFilename, Target, TargetLength);
+  memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength);
+  TargetFilename[FilenameLength + 1 + TargetLength] = 0;
+
+  /* Check if there is llvm/runtime version mismatch.  */
+  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
+    PROF_ERR("Runtime and instrumentation version mismatch : "
+             "expected %d, but get %d\n",
+             INSTR_PROF_RAW_VERSION,
+             (int)GET_VERSION(__llvm_profile_get_version()));
+    if (PDeathSig == 1)
+      lprofRestoreSigKill();
+    return -1;
+  }
+
+  /* Clean old target file */
+  forceTruncateFile(TargetFilename);
+
+  /* Open target-specific PGO file */
+  MergeDone = 0;
+  FILE *OutputFile = getMergeFileObject(TargetFilename, &MergeDone);
+
+  if (!OutputFile) {
+    PROF_ERR("Failed to open file : %s\n", TargetFilename);
+    if (PDeathSig == 1)
+      lprofRestoreSigKill();
+    return -1;
+  }
+
+  FreeHook = &free;
+  setupIOBuffer();
+  ProfDataWriter fileWriter;
+  initFileWriter(&fileWriter, OutputFile);
+
+  /* Write custom data to the file */
+  ReturnValue = lprofWriteDataImpl(
+      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
+      lprofGetVPDataReader(), NamesBegin, NamesEnd, MergeDone);
+
+  closeFileObject(OutputFile);
+
+  // Restore SIGKILL.
+  if (PDeathSig == 1)
+    lprofRestoreSigKill();
+
+  /* Restore old profiling file */
+  setProfileFile(oldFile);
+
+  return ReturnValue;
+}
+
 #endif
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index f5a15ca11bfcda..af0cd4dcdf5dcf 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -63,14 +63,24 @@ struct __llvm_profile_data {
 #include "llvm/ProfileData/InstrProfData.inc"
 };
 
+extern "C" {
+extern int __attribute__((weak))
+__llvm_write_custom_profile(const char *Target,
+                            const __llvm_profile_data *DataBegin,
+                            const __llvm_profile_data *DataEnd,
+                            const char *CountersBegin, const char *CountersEnd,
+                            const char *NamesBegin, const char *NamesEnd);
+}
+
 /// PGO profiling data extracted from a GPU device
 struct GPUProfGlobals {
-  SmallVector<uint8_t> NamesData;
-  SmallVector<SmallVector<int64_t>> Counts;
+  SmallVector<int64_t> Counts;
   SmallVector<__llvm_profile_data> Data;
+  SmallVector<uint8_t> NamesData;
   Triple TargetTriple;
 
   void dump() const;
+  Error write() const;
 };
 
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 1fce2448922624..2f16b6e3c139e9 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -205,7 +205,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
       GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
       if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
         return Err;
-      DeviceProfileData.Counts.push_back(std::move(Counts));
+      DeviceProfileData.Counts.append(std::move(Counts));
     } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
       // Read profiling data for this global variable
       __llvm_profile_data Data{};
@@ -223,15 +223,14 @@ void GPUProfGlobals::dump() const {
          << "\n";
 
   outs() << "======== Counters =========\n";
-  for (const auto &Count : Counts) {
-    outs() << "[";
-    for (size_t i = 0; i < Count.size(); i++) {
-      if (i == 0)
-        outs() << " ";
-      outs() << Count[i] << " ";
-    }
-    outs() << "]\n";
+  for (size_t i = 0; i < Counts.size(); i++) {
+    if (i > 0 && i % 10 == 0)
+      outs() << "\n";
+    else if (i != 0)
+      outs() << " ";
+    outs() << Counts[i];
   }
+  outs() << "\n";
 
   outs() << "========== Data ===========\n";
   for (const auto &ProfData : Data) {
@@ -256,3 +255,43 @@ void GPUProfGlobals::dump() const {
   Symtab.dumpNames(outs());
   outs() << "===========================\n";
 }
+
+Error GPUProfGlobals::write() const {
+  if (!__llvm_write_custom_profile)
+    return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
+                         "The compiler-rt profiling library must be linked for "
+                         "GPU PGO to work.");
+
+  size_t DataSize = Data.size() * sizeof(__llvm_profile_data),
+         CountsSize = Counts.size() * sizeof(int64_t);
+  __llvm_profile_data *DataBegin, *DataEnd;
+  char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd;
+
+  // Initialize array of contiguous data. We need to make sure each section is
+  // contiguous so that the PGO library can compute deltas properly
+  SmallVector<uint8_t> ContiguousData(NamesData.size() + DataSize + CountsSize);
+
+  // Compute region pointers
+  DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize);
+  DataEnd =
+      (__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize);
+  CountersBegin = (char *)ContiguousData.data();
+  CountersEnd = (char *)(ContiguousData.data() + CountsSize);
+  NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize);
+  NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize +
+                      NamesData.size());
+
+  // Copy data to contiguous buffer
+  memcpy(DataBegin, Data.data(), DataSize);
+  memcpy(CountersBegin, Counts.data(), CountsSize);
+  memcpy(NamesBegin, NamesData.data(), NamesData.size());
+
+  // Invoke compiler-rt entrypoint
+  int result = __llvm_write_custom_profile(TargetTriple.str().c_str(),
+                                           DataBegin, DataEnd, CountersBegin,
+                                           CountersEnd, NamesBegin, NamesEnd);
+  if (result != 0)
+    return Plugin::error("Error writing GPU PGO data to file");
+
+  return Plugin::success();
+}
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 1ea93795ce8ce4..d5e6b6128152dc 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -837,8 +837,10 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
     if (!ProfOrErr)
       return ProfOrErr.takeError();
 
-    // TODO: write data to profiling file
-    ProfOrErr->dump();
+    // Write data to profiling file
+    if (auto Err = ProfOrErr->write()) {
+      consumeError(std::move(Err));
+    }
   }
 
   // Delete the memory manager before deinitializing the device. Otherwise,

>From b8c916305acf08c0bd2d51b81875be5e8fc59ff3 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 13 Mar 2024 20:05:32 -0500
Subject: [PATCH 21/38] Fix tests

---
 .../plugins-nextgen/common/src/PluginInterface.cpp        | 3 +++
 openmp/libomptarget/test/offloading/pgo1.c                | 8 ++------
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index d5e6b6128152dc..2359ad28a25b04 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -837,6 +837,9 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
     if (!ProfOrErr)
       return ProfOrErr.takeError();
 
+    // Dump out profdata
+    ProfOrErr->dump();
+
     // Write data to profiling file
     if (auto Err = ProfOrErr->write()) {
       consumeError(std::move(Err));
diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c
index d95793b508dcfc..79e93d0f10827f 100644
--- a/openmp/libomptarget/test/offloading/pgo1.c
+++ b/openmp/libomptarget/test/offloading/pgo1.c
@@ -32,9 +32,7 @@ int main() {
 }
 
 // CLANG-PGO: ======== Counters =========
-// CLANG-PGO-NEXT: [ 0 11 20 ]
-// CLANG-PGO-NEXT: [ 10 ]
-// CLANG-PGO-NEXT: [ 20 ]
+// CLANG-PGO-NEXT: 0 11 20 10 20
 // CLANG-PGO-NEXT: ========== Data ===========
 // CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
 // CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
@@ -55,9 +53,7 @@ int main() {
 // CLANG-PGO-NEXT: test2
 
 // LLVM-PGO: ======== Counters =========
-// LLVM-PGO-NEXT: [ 20 ]
-// LLVM-PGO-NEXT: [ 10 ]
-// LLVM-PGO-NEXT: [ 20 10 1 1 ]
+// LLVM-PGO-NEXT: 20 10 20 10 1 1
 // LLVM-PGO-NEXT: ========== Data ===========
 // LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
 // LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}

>From 7770b37a5a4c40bd45887f762bd7f1e652bc0ed2 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 7 May 2024 16:31:48 -0500
Subject: [PATCH 22/38] Fix params

---
 compiler-rt/lib/profile/InstrProfilingFile.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 466bfe480543bc..bc1d40a37a5ad6 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1360,9 +1360,10 @@ int __llvm_write_custom_profile(const char *Target,
   initFileWriter(&fileWriter, OutputFile);
 
   /* Write custom data to the file */
-  ReturnValue = lprofWriteDataImpl(
-      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
-      lprofGetVPDataReader(), NamesBegin, NamesEnd, MergeDone);
+  ReturnValue =
+      lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin,
+                         CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL,
+                         NULL, NULL, NULL, NamesBegin, NamesEnd, MergeDone);
 
   closeFileObject(OutputFile);
 

>From aa895a1788969a0d27692057a1457074e9772c78 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Mon, 18 Mar 2024 21:31:32 -0500
Subject: [PATCH 23/38] Fix elf obj file

---
 offload/plugins-nextgen/common/src/GlobalHandler.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 80cdcaff75528e..7717e19a5b6779 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -177,16 +177,19 @@ Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
   GPUProfGlobals DeviceProfileData;
-  auto ELFObj = getELFObjectFile(Image);
-  if (!ELFObj)
-    return ELFObj.takeError();
+  auto ObjFile = getELFObjectFile(Image);
+  if (!ObjFile)
+    return ObjFile.takeError();
+
+  std::unique_ptr<ELFObjectFileBase> ELFObj(
+      static_cast<ELFObjectFileBase *>(ObjFile->release()));
   DeviceProfileData.TargetTriple = ELFObj->makeTriple();
 
   // Iterate through elf symbols
   for (auto &Sym : ELFObj->symbols()) {
     auto NameOrErr = Sym.getName();
     if (!NameOrErr)
-      return ELFObj.takeError();
+      return NameOrErr.takeError();
 
     // Check if given current global is a profiling global based
     // on name

>From 2031e49c2b26864f2dab72e629eb6cbe34928a7a Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Mon, 6 May 2024 23:13:58 -0500
Subject: [PATCH 24/38] Add more addrspace casts for GPU targets

---
 .../Transforms/Instrumentation/InstrProfiling.cpp   | 11 ++++++++---
 .../Instrumentation/PGOInstrumentation.cpp          | 13 +++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index a6b1e0d488120a..dd8c027c4bbf62 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -869,6 +869,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
                       llvm::InstrProfValueKind::IPVK_MemOPSize);
   CallInst *Call = nullptr;
   auto *TLI = &GetTLI(*Ind->getFunction());
+  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      DataVar, PointerType::getUnqual(M.getContext()));
 
   // To support value profiling calls within Windows exception handlers, funclet
   // information contained within operand bundles needs to be copied over to
@@ -877,11 +879,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   SmallVector<OperandBundleDef, 1> OpBundles;
   Ind->getOperandBundlesAsDefs(OpBundles);
   if (!IsMemOpSize) {
-    Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
+    Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr,
+                      Builder.getInt32(Index)};
     Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args,
                               OpBundles);
   } else {
-    Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
+    Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr,
+                      Builder.getInt32(Index)};
     Call = Builder.CreateCall(
         getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp),
         Args, OpBundles);
@@ -1575,7 +1579,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
         getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
     ValuesVar->setAlignment(Align(8));
     maybeSetComdat(ValuesVar, Fn, CntsVarName);
-    ValuesPtrExpr = ValuesVar;
+    ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+        ValuesVar, PointerType::getUnqual(Fn->getContext()));
   }
 
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 4b51396a8baa35..ee1657ba8400ee 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1007,12 +1007,15 @@ static void instrumentOneFunc(
         ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
       assert(ToProfile && "value profiling Value is of unexpected type");
 
+      auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+          Name, PointerType::get(M->getContext(), 0));
+
       SmallVector<OperandBundleDef, 1> OpBundles;
       populateEHOperandBundle(Cand, BlockColors, OpBundles);
       Builder.CreateCall(
           Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
-          {FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash),
-           ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
+          {NormalizedPtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
+           Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
           OpBundles);
     }
   } // IPVK_First <= Kind <= IPVK_Last
@@ -1685,10 +1688,12 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   IRBuilder<> Builder(&SI);
   Type *Int64Ty = Builder.getInt64Ty();
   auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
+  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      FuncNameVar, PointerType::get(M->getContext(), 0));
   Builder.CreateCall(
       Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
-      {FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
-       Builder.getInt32(*CurCtrIdx), Step});
+      {NormalizedPtr, Builder.getInt64(FuncHash),
+       Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
   ++(*CurCtrIdx);
 }
 

>From be6524bb4f77de0add1e698f68115fd336f32238 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Mon, 13 May 2024 17:41:00 -0500
Subject: [PATCH 25/38]  Have test read from profraw instead of dump

---
 offload/test/lit.cfg           |  2 +
 offload/test/offloading/pgo1.c | 94 ++++++++++++++++------------------
 2 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 069110dc69a6e4..38e6a33b01fafc 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -391,6 +391,8 @@ if config.test_fortran_compiler:
     config.available_features.add('flang')
     config.substitutions.append(("%flang", config.test_fortran_compiler))
 
+config.substitutions.append(("%target_triple", config.libomptarget_current_target))
+
 config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
 if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
     config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index 79e93d0f10827f..d22d5340f5b3ec 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -1,22 +1,21 @@
-// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
-// RUN:     -Xclang "-fprofile-instrument=clang"
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
-// RUN:     --check-prefix="CLANG-PGO"
-// RUN: %libomptarget-compile-generic -fprofile-generate \
-// RUN:     -Xclang "-fprofile-instrument=llvm"
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
+// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=llvm"
+// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.llvm.profraw | %fcheck-generic \
 // RUN:     --check-prefix="LLVM-PGO"
 
+// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=clang"
+// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
+// RUN: llvm-profdata show --all-functions --counts \
+// RUN:     %target_triple.clang.profraw | %fcheck-generic \
+// RUN:     --check-prefix="CLANG-PGO"
+
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // REQUIRES: pgo
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
 
@@ -31,43 +30,38 @@ int main() {
   }
 }
 
-// CLANG-PGO: ======== Counters =========
-// CLANG-PGO-NEXT: 0 11 20 10 20
-// CLANG-PGO-NEXT: ========== Data ===========
-// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// CLANG-PGO-NEXT: ======== Functions ========
-// CLANG-PGO-NEXT: pgo1.c:
-// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
-// CLANG-PGO-NEXT: test1
-// CLANG-PGO-NEXT: test2
+// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 4
+// LLVM-PGO: Function count: 20
+// LLVM-PGO: Block counts: [10, 20, 10]
+
+// LLVM-PGO-LABEL: test1:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 1
+// LLVM-PGO: Function count: 1
+// LLVM-PGO: Block counts: []
+
+// LLVM-PGO-LABEL: test2:
+// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// LLVM-PGO: Counters: 1
+// LLVM-PGO: Function count: 1
+// LLVM-PGO: Block counts: []
+
+// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 3
+// CLANG-PGO: Function count: 0
+// CLANG-PGO: Block counts: [11, 20]
+
+// CLANG-PGO-LABEL: test1:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 1
+// CLANG-PGO: Function count: 10
+// CLANG-PGO: Block counts: []
 
-// LLVM-PGO: ======== Counters =========
-// LLVM-PGO-NEXT: 20 10 20 10 1 1
-// LLVM-PGO-NEXT: ========== Data ===========
-// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}}
-// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} }
-// LLVM-PGO-NEXT: ======== Functions ========
-// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}
-// LLVM-PGO-NEXT: test1
-// LLVM-PGO-NEXT: test2
+// CLANG-PGO-LABEL: test2:
+// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
+// CLANG-PGO: Counters: 1
+// CLANG-PGO: Function count: 20
+// CLANG-PGO: Block counts: []

>From 2b8eb2935ec21bf0acc5c56f45837b5976560963 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 24 May 2024 19:59:33 -0500
Subject: [PATCH 26/38] Fix PGO test format

---
 offload/test/offloading/pgo1.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index d22d5340f5b3ec..0e75c684ed9263 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -33,20 +33,17 @@ int main() {
 // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 4
-// LLVM-PGO: Function count: 20
-// LLVM-PGO: Block counts: [10, 20, 10]
+// LLVM-PGO: Block counts: [20, 10, 20, 10]
 
 // LLVM-PGO-LABEL: test1:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 1
-// LLVM-PGO: Function count: 1
-// LLVM-PGO: Block counts: []
+// LLVM-PGO: Block counts: [1]
 
 // LLVM-PGO-LABEL: test2:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 1
-// LLVM-PGO: Function count: 1
-// LLVM-PGO: Block counts: []
+// LLVM-PGO: Block counts: [1]
 
 // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
 // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}

>From 67f3009173d815295f36e2b37e85add1347e3bf9 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 24 May 2024 20:45:04 -0500
Subject: [PATCH 27/38] Refactor profile writer

---
 compiler-rt/lib/profile/InstrProfilingFile.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index bc1d40a37a5ad6..76238214c13aa3 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1344,8 +1344,7 @@ int __llvm_write_custom_profile(const char *Target,
   forceTruncateFile(TargetFilename);
 
   /* Open target-specific PGO file */
-  MergeDone = 0;
-  FILE *OutputFile = getMergeFileObject(TargetFilename, &MergeDone);
+  FILE *OutputFile = getFileObject(TargetFilename);
 
   if (!OutputFile) {
     PROF_ERR("Failed to open file : %s\n", TargetFilename);
@@ -1356,15 +1355,11 @@ int __llvm_write_custom_profile(const char *Target,
 
   FreeHook = &free;
   setupIOBuffer();
-  ProfDataWriter fileWriter;
-  initFileWriter(&fileWriter, OutputFile);
-
-  /* Write custom data to the file */
-  ReturnValue =
-      lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin,
-                         CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL,
-                         NULL, NULL, NULL, NamesBegin, NamesEnd, MergeDone);
 
+  /* Write custom data */
+  ReturnValue = __llvm_profile_write_buffer_internal(
+      OutputFile, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
+      NamesBegin, NamesEnd);
   closeFileObject(OutputFile);
 
   // Restore SIGKILL.

>From e8ad1322c557f7b48e2b28fe3a34a696a1103bba Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Mon, 27 May 2024 18:29:18 -0500
Subject: [PATCH 28/38] Fix refactor bug

---
 compiler-rt/lib/profile/InstrProfilingFile.c | 52 ++++++++++----------
 offload/test/offloading/pgo1.c               |  6 ++-
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 76238214c13aa3..784cb9af6169d8 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -505,14 +505,6 @@ static FILE *getFileObject(const char *OutputName) {
   return fopen(OutputName, "ab");
 }
 
-/* Get file object and merge if applicable */
-static FILE *getMergeFileObject(const char *OutputName, int *MergeDone) {
-  VPMergeHook = &lprofMergeValueProfData;
-  if (doMerging())
-    return openFileForMerging(OutputName, MergeDone);
-  return getFileObject(OutputName);
-}
-
 static void closeFileObject(FILE *OutputFile) {
   if (OutputFile == getProfileFile()) {
     fflush(OutputFile);
@@ -526,8 +518,15 @@ static void closeFileObject(FILE *OutputFile) {
 
 /* Write profile data to file \c OutputName.  */
 static int writeFile(const char *OutputName) {
-  int RetVal, MergeDone = 0;
-  FILE *OutputFile = getMergeFileObject(OutputName, &MergeDone);
+  int RetVal;
+  FILE *OutputFile;
+
+  int MergeDone = 0;
+  VPMergeHook = &lprofMergeValueProfData;
+  if (doMerging())
+    OutputFile = openFileForMerging(OutputName, &MergeDone);
+  else
+    OutputFile = getFileObject(OutputName);
 
   if (!OutputFile)
     return -1;
@@ -565,16 +564,10 @@ static int writeOrderFile(const char *OutputName) {
 
 #define LPROF_INIT_ONCE_ENV "__LLVM_PROFILE_RT_INIT_ONCE"
 
-static void forceTruncateFile(const char *Filename) {
-  FILE *File = fopen(Filename, "w");
-  if (!File)
-    return;
-  fclose(File);
-}
-
 static void truncateCurrentFile(void) {
   const char *Filename;
   char *FilenameBuf;
+  FILE *File;
   int Length;
 
   Length = getCurFilenameLength();
@@ -604,7 +597,10 @@ static void truncateCurrentFile(void) {
     return;
 
   /* Truncate the file.  Later we'll reopen and append. */
-  forceTruncateFile(Filename);
+  File = fopen(Filename, "w");
+  if (!File)
+    return;
+  fclose(File);
 }
 
 /* Write a partial profile to \p Filename, which is required to be backed by
@@ -1287,7 +1283,7 @@ int __llvm_write_custom_profile(const char *Target,
                                 const char *CountersBegin,
                                 const char *CountersEnd, const char *NamesBegin,
                                 const char *NamesEnd) {
-  int ReturnValue = 0, FilenameLength, TargetLength, MergeDone;
+  int ReturnValue = 0, FilenameLength, TargetLength;
   char *FilenameBuf, *TargetFilename;
   const char *Filename;
 
@@ -1340,11 +1336,9 @@ int __llvm_write_custom_profile(const char *Target,
     return -1;
   }
 
-  /* Clean old target file */
-  forceTruncateFile(TargetFilename);
-
-  /* Open target-specific PGO file */
-  FILE *OutputFile = getFileObject(TargetFilename);
+  /* Open and truncate target-specific PGO file */
+  FILE *OutputFile = fopen(TargetFilename, "w");
+  setProfileFile(OutputFile);
 
   if (!OutputFile) {
     PROF_ERR("Failed to open file : %s\n", TargetFilename);
@@ -1357,9 +1351,13 @@ int __llvm_write_custom_profile(const char *Target,
   setupIOBuffer();
 
   /* Write custom data */
-  ReturnValue = __llvm_profile_write_buffer_internal(
-      OutputFile, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
-      NamesBegin, NamesEnd);
+  ProfDataWriter fileWriter;
+  initFileWriter(&fileWriter, OutputFile);
+
+  /* Write custom data to the file */
+  ReturnValue = lprofWriteDataImpl(
+      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
+      lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);
   closeFileObject(OutputFile);
 
   // Restore SIGKILL.
diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index 0e75c684ed9263..d6747113265803 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -1,10 +1,12 @@
-// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=llvm"
+// RUN: %libomptarget-compile-generic -fprofile-generate \
+// RUN:     -Xclang "-fprofile-instrument=llvm"
 // RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
 // RUN: llvm-profdata show --all-functions --counts \
 // RUN:     %target_triple.llvm.profraw | %fcheck-generic \
 // RUN:     --check-prefix="LLVM-PGO"
 
-// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=clang"
+// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
+// RUN:     -Xclang "-fprofile-instrument=clang"
 // RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
 // RUN: llvm-profdata show --all-functions --counts \
 // RUN:     %target_triple.clang.profraw | %fcheck-generic \

>From 4c9f814ce14aeb6766a93f5c1d15b847b98dc29f Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 28 May 2024 12:58:43 -0500
Subject: [PATCH 29/38] Make requested clang-format change

---
 offload/plugins-nextgen/common/include/GlobalHandler.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 017d7e994f07a8..1d7b9f80f9dfd3 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -64,12 +64,10 @@ struct __llvm_profile_data {
 };
 
 extern "C" {
-extern int __attribute__((weak))
-__llvm_write_custom_profile(const char *Target,
-                            const __llvm_profile_data *DataBegin,
-                            const __llvm_profile_data *DataEnd,
-                            const char *CountersBegin, const char *CountersEnd,
-                            const char *NamesBegin, const char *NamesEnd);
+extern int __attribute__((weak)) __llvm_write_custom_profile(
+    const char *Target, const __llvm_profile_data *DataBegin,
+    const __llvm_profile_data *DataEnd, const char *CountersBegin,
+    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
 }
 
 /// PGO profiling data extracted from a GPU device

>From 344e357de657f54c068be969dcfc3ea33f2f026e Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 31 May 2024 20:29:20 -0500
Subject: [PATCH 30/38] Tighten PGO test requirements

Require compiler-rt to be an enabled runtime
---
 offload/test/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt
index 32df1e47afaeb2..41ab339147791c 100644
--- a/offload/test/CMakeLists.txt
+++ b/offload/test/CMakeLists.txt
@@ -12,10 +12,10 @@ else()
   set(LIBOMPTARGET_DEBUG False)
 endif()
 
-if (OPENMP_STANDALONE_BUILD)
-  set(LIBOMPTARGET_TEST_GPU_PGO False)
-else()
+if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
   set(LIBOMPTARGET_TEST_GPU_PGO True)
+else()
+  set(LIBOMPTARGET_TEST_GPU_PGO False)
 endif()
 
 # Replace the space from user's input with ";" in case that CMake add escape

>From 2f751420b9ad2ffc7c9fac4a645724b45cdae59a Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 31 May 2024 20:29:20 -0500
Subject: [PATCH 31/38] Tighten PGO test requirements

Require compiler-rt to be an enabled runtime
---
 offload/test/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt
index 32df1e47afaeb2..41ab339147791c 100644
--- a/offload/test/CMakeLists.txt
+++ b/offload/test/CMakeLists.txt
@@ -12,10 +12,10 @@ else()
   set(LIBOMPTARGET_DEBUG False)
 endif()
 
-if (OPENMP_STANDALONE_BUILD)
-  set(LIBOMPTARGET_TEST_GPU_PGO False)
-else()
+if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
   set(LIBOMPTARGET_TEST_GPU_PGO True)
+else()
+  set(LIBOMPTARGET_TEST_GPU_PGO False)
 endif()
 
 # Replace the space from user's input with ";" in case that CMake add escape

>From 488cb4a349fdfbd73d0a78ddb2c17522c46145ba Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 26 Jun 2024 18:18:31 -0500
Subject: [PATCH 32/38] Apply requested formatting changes

---
 clang/lib/CodeGen/CodeGenPGO.cpp              | 11 +++++-----
 llvm/lib/ProfileData/InstrProf.cpp            |  4 ++--
 .../Instrumentation/InstrProfiling.cpp        | 10 ++++-----
 .../Instrumentation/PGOInstrumentation.cpp    | 21 ++++++++++---------
 offload/DeviceRTL/src/Profiling.cpp           |  6 ++++--
 5 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index a7ce0b8f6a35f3..3edfbdd679c61d 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1199,12 +1199,13 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
+  auto *NormalizedFuncNameVarPtr =
+      llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+          FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
 
-  llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
-                         Builder.getInt32(NumRegionCounters),
-                         Builder.getInt32(Counter), StepV};
+  llvm::Value *Args[] = {
+      NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash),
+      Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV};
 
   if (llvm::EnableSingleByteCoverage)
     Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover),
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 1284efd4b5f4da..6742435c9d065e 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -433,8 +433,8 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
 }
 
 bool isGPUProfTarget(const Module &M) {
-  const auto &Triple = llvm::Triple(M.getTargetTriple());
-  return Triple.isAMDGPU() || Triple.isNVPTX();
+  const auto &T = Triple(M.getTargetTriple());
+  return T.isAMDGPU() || T.isNVPTX();
 }
 
 void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index dd8c027c4bbf62..05cef1236f0879 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -869,8 +869,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
                       llvm::InstrProfValueKind::IPVK_MemOPSize);
   CallInst *Call = nullptr;
   auto *TLI = &GetTLI(*Ind->getFunction());
-  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      DataVar, PointerType::getUnqual(M.getContext()));
+  auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      DataVar, PointerType::get(M.getContext(), 0));
 
   // To support value profiling calls within Windows exception handlers, funclet
   // information contained within operand bundles needs to be copied over to
@@ -879,12 +879,12 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   SmallVector<OperandBundleDef, 1> OpBundles;
   Ind->getOperandBundlesAsDefs(OpBundles);
   if (!IsMemOpSize) {
-    Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr,
+    Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
                       Builder.getInt32(Index)};
     Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args,
                               OpBundles);
   } else {
-    Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr,
+    Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
                       Builder.getInt32(Index)};
     Call = Builder.CreateCall(
         getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp),
@@ -1580,7 +1580,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
     ValuesVar->setAlignment(Align(8));
     maybeSetComdat(ValuesVar, Fn, CntsVarName);
     ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-        ValuesVar, PointerType::getUnqual(Fn->getContext()));
+        ValuesVar, PointerType::get(Fn->getContext(), 0));
   }
 
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index ee1657ba8400ee..f8f34ea25597f3 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -884,7 +884,7 @@ static void instrumentOneFunc(
                                   FuncInfo.FunctionHash);
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+  auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
       Name, PointerType::get(M->getContext(), 0));
   if (PGOFunctionEntryCoverage) {
     auto &EntryBB = F.getEntryBlock();
@@ -893,7 +893,7 @@ static void instrumentOneFunc(
     //                      i32 <index>)
     Builder.CreateCall(
         Intrinsic::getDeclaration(M, Intrinsic::instrprof_cover),
-        {NormalizedPtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
+        {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
     return;
   }
 
@@ -948,7 +948,7 @@ static void instrumentOneFunc(
     //                          i32 <index>)
     Builder.CreateCall(
         Intrinsic::getDeclaration(M, Intrinsic::instrprof_timestamp),
-        {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters),
+        {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
          Builder.getInt32(I)});
     I += PGOBlockCoverage ? 8 : 1;
   }
@@ -963,7 +963,7 @@ static void instrumentOneFunc(
         Intrinsic::getDeclaration(M, PGOBlockCoverage
                                          ? Intrinsic::instrprof_cover
                                          : Intrinsic::instrprof_increment),
-        {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters),
+        {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
          Builder.getInt32(I++)});
   }
 
@@ -1007,15 +1007,15 @@ static void instrumentOneFunc(
         ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
       assert(ToProfile && "value profiling Value is of unexpected type");
 
-      auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           Name, PointerType::get(M->getContext(), 0));
 
       SmallVector<OperandBundleDef, 1> OpBundles;
       populateEHOperandBundle(Cand, BlockColors, OpBundles);
       Builder.CreateCall(
           Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
-          {NormalizedPtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
-           Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
+          {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash),
+           ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
           OpBundles);
     }
   } // IPVK_First <= Kind <= IPVK_Last
@@ -1688,11 +1688,12 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   IRBuilder<> Builder(&SI);
   Type *Int64Ty = Builder.getInt64Ty();
   auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
-  auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      FuncNameVar, PointerType::get(M->getContext(), 0));
+  auto *NormalizedFuncNameVarPtr =
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+          FuncNameVar, PointerType::get(M->getContext(), 0));
   Builder.CreateCall(
       Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
-      {NormalizedPtr, Builder.getInt64(FuncHash),
+      {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
        Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
   ++(*CurCtrIdx);
 }
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
index 799477f5e47d27..639c62ceff7a69 100644
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ b/offload/DeviceRTL/src/Profiling.cpp
@@ -12,8 +12,10 @@
 
 extern "C" {
 
-void __llvm_profile_register_function(void *ptr) {}
-void __llvm_profile_register_names_function(void *ptr, long int i) {}
+// Provides empty implementations for certain functions in compiler-rt
+// that are emitted by the PGO instrumentation.
+void __llvm_profile_register_function(void *Ptr) {}
+void __llvm_profile_register_names_function(void *Ptr, long int I) {}
 }
 
 #pragma omp end declare target

>From b90c01583f1893802aba0180b07a448584585365 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 26 Jun 2024 18:29:59 -0500
Subject: [PATCH 33/38] Add memop function shim to DeviceRTL

This comes up sometimes when using LLVM IR level instrumentation.
---
 offload/DeviceRTL/include/Profiling.h | 1 +
 offload/DeviceRTL/src/Profiling.cpp   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h
index 9efc1554c176bc..d9947522541219 100644
--- a/offload/DeviceRTL/include/Profiling.h
+++ b/offload/DeviceRTL/include/Profiling.h
@@ -15,6 +15,7 @@
 extern "C" {
 void __llvm_profile_register_function(void *Ptr);
 void __llvm_profile_register_names_function(void *Ptr, long int I);
+void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
 }
 
 #endif
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
index 639c62ceff7a69..bb3caaadcc03dd 100644
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ b/offload/DeviceRTL/src/Profiling.cpp
@@ -16,6 +16,7 @@ extern "C" {
 // that are emitted by the PGO instrumentation.
 void __llvm_profile_register_function(void *Ptr) {}
 void __llvm_profile_register_names_function(void *Ptr, long int I) {}
+void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
 }
 
 #pragma omp end declare target

>From c68c6e2fa98a1fe608b88ed38f7db68eae804c5b Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 27 Jun 2024 02:04:27 -0500
Subject: [PATCH 34/38] Make requested changes

---
 compiler-rt/lib/profile/InstrProfiling.h               | 2 +-
 compiler-rt/lib/profile/InstrProfilingFile.c           | 1 -
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 5 ++---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index ef1292a45bf01d..eda3e9a673c1af 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -298,7 +298,7 @@ void __llvm_profile_set_dumped();
 
 /*!
  * \brief Write custom target-specific profiling data to a seperate file.
- * Used by libomptarget for GPU PGO.
+ * Used by offload PGO.
  */
 int __llvm_write_custom_profile(const char *Target,
                                 const __llvm_profile_data *DataBegin,
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 784cb9af6169d8..93436ecbabb40d 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -1321,7 +1321,6 @@ int __llvm_write_custom_profile(const char *Target,
   /* Prepend "TARGET." to current filename */
   memcpy(TargetFilename, Target, TargetLength);
   TargetFilename[TargetLength] = '.';
-  memcpy(TargetFilename, Target, TargetLength);
   memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength);
   TargetFilename[FilenameLength + 1 + TargetLength] = 0;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c4e1e63777de8a..445f4ad942bd4d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -843,9 +843,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
     ProfOrErr->dump();
 
     // Write data to profiling file
-    if (auto Err = ProfOrErr->write()) {
-      consumeError(std::move(Err));
-    }
+    if (auto Err = ProfOrErr->write())
+      return Err;
   }
 
   // Delete the memory manager before deinitializing the device. Otherwise,

>From ca52c58c7fde412897cf6b10b9bbb321812f193d Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 27 Jun 2024 02:26:20 -0500
Subject: [PATCH 35/38] Only dump counters if PGODump flag is set

---
 offload/include/Shared/Environment.h                   | 1 +
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 4 +++-
 openmp/docs/design/Runtimes.rst                        | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index d141146b6bd5a1..86f6d1c6ea2d36 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t {
   FunctionTracing = 1U << 1,
   CommonIssues = 1U << 2,
   AllocationTracker = 1U << 3,
+  PGODump = 1U << 4,
 };
 
 struct DeviceEnvironmentTy {
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 445f4ad942bd4d..35fb04863d8741 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -840,7 +840,9 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
       return ProfOrErr.takeError();
 
     // Dump out profdata
-    ProfOrErr->dump();
+    if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
+        uint32_t(DeviceDebugKind::PGODump))
+      ProfOrErr->dump();
 
     // Write data to profiling file
     if (auto Err = ProfOrErr->write())
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index f8a8cb87e83e66..7fc697a838e229 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1493,3 +1493,4 @@ debugging features are supported.
     * Enable debugging assertions in the device. ``0x01``
     * Enable diagnosing common problems during offloading . ``0x4``
     * Enable device malloc statistics (amdgpu only). ``0x8``
+    * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``

>From ee4431a1b57469c7679f54f124ca5f3dd7f0433b Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 9 Aug 2024 20:21:38 -0500
Subject: [PATCH 36/38] Update requirements

---
 offload/test/offloading/pgo1.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index d6747113265803..fbf6337374a997 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -12,10 +12,7 @@
 // RUN:     %target_triple.clang.profraw | %fcheck-generic \
 // RUN:     --check-prefix="CLANG-PGO"
 
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// REQUIRES: gpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }

>From fb699b6bca72d42359a304bcbba88f3564ae9ac9 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Sat, 10 Aug 2024 00:54:36 -0500
Subject: [PATCH 37/38] Merge changes

---
 offload/plugins-nextgen/common/src/GlobalHandler.cpp | 2 +-
 offload/test/offloading/pgo1.c                       | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index bca66cff6558a2..d7bfbba01c8efc 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -193,7 +193,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
 
     // Check if given current global is a profiling global based
     // on name
-    if (NameOrErr->equals(getInstrProfNamesVarName())) {
+    if (*NameOrErr == getInstrProfNamesVarName()) {
       // Read in profiled function names
       DeviceProfileData.NamesData = SmallVector<uint8_t>(Sym.getSize(), 0);
       GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(),
diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index fbf6337374a997..3270ce8f15e7dc 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -32,17 +32,17 @@ int main() {
 // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 4
-// LLVM-PGO: Block counts: [20, 10, 20, 10]
+// LLVM-PGO: Block counts: [20, 10, 2, 1]
 
 // LLVM-PGO-LABEL: test1:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 1
-// LLVM-PGO: Block counts: [1]
+// LLVM-PGO: Block counts: [10]
 
 // LLVM-PGO-LABEL: test2:
 // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}}
 // LLVM-PGO: Counters: 1
-// LLVM-PGO: Block counts: [1]
+// LLVM-PGO: Block counts: [20]
 
 // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}:
 // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}}

>From 1d0a961aabe488e6d09b96a80329498b8f586923 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 25 Oct 2024 13:42:19 -0500
Subject: [PATCH 38/38] Add llvm-profdata substitution to offload tests

---
 offload/test/lit.cfg           | 2 ++
 offload/test/lit.site.cfg.in   | 2 +-
 offload/test/offloading/pgo1.c | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 7994a08ba063fb..cfd1ad6c3c1eb5 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target)
 if config.libomptarget_has_libc:
   config.available_features.add('libc')
 
+profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
 if config.libomptarget_test_pgo:
   config.available_features.add('pgo')
+  config.substitutions.append(("%profdata", profdata_path))
 
 # Determine whether the test system supports unified memory.
 # For CUDA, this is the case with compute capability 70 (Volta) or higher.
diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in
index a1cb5acc38a405..d998fb0c839700 100644
--- a/offload/test/lit.site.cfg.in
+++ b/offload/test/lit.site.cfg.in
@@ -1,6 +1,6 @@
 @AUTO_GEN_COMMENT@
 
-config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
+config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@"
 config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
 config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
 config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"
diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c
index 1ef540e430a27a..51671afa62b0db 100644
--- a/offload/test/offloading/pgo1.c
+++ b/offload/test/offloading/pgo1.c
@@ -1,14 +1,14 @@
 // RUN: %libomptarget-compile-generic -fprofile-generate \
 // RUN:     -Xclang "-fprofile-instrument=llvm"
 // RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.llvm.profraw | %fcheck-generic \
 // RUN:     --check-prefix="LLVM-PGO"
 
 // RUN: %libomptarget-compile-generic -fprofile-instr-generate \
 // RUN:     -Xclang "-fprofile-instrument=clang"
 // RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.clang.profraw | %fcheck-generic \
 // RUN:     --check-prefix="CLANG-PGO"
 



More information about the cfe-commits mailing list