[clang-tools-extra] [openmp] [llvm] [clang] [PGO][OpenMP] Instrumentation for GPU devices (PR #76587)

Ethan Luis McDonough via cfe-commits cfe-commits at lists.llvm.org
Wed Jan 3 15:19:06 PST 2024


https://github.com/EthanLuisMcDonough updated https://github.com/llvm/llvm-project/pull/76587

>From 530eb982b9770190377bb0bd09c5cb715f34d484 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 15 Dec 2023 20:38:38 -0600
Subject: [PATCH 1/8] Add profiling functions to libomptarget

---
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  3 +++
 openmp/libomptarget/DeviceRTL/CMakeLists.txt  |  2 ++
 .../DeviceRTL/include/Profiling.h             | 21 +++++++++++++++++++
 .../libomptarget/DeviceRTL/src/Profiling.cpp  | 19 +++++++++++++++++
 4 files changed, 45 insertions(+)
 create mode 100644 openmp/libomptarget/DeviceRTL/include/Profiling.h
 create mode 100644 openmp/libomptarget/DeviceRTL/src/Profiling.cpp

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00..1d887d5cb58127 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -503,6 +503,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
+__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e40a80ab..55ee15d068c67b 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -89,6 +89,7 @@ set(include_files
   ${include_directory}/Interface.h
   ${include_directory}/LibC.h
   ${include_directory}/Mapping.h
+  ${include_directory}/Profiling.h
   ${include_directory}/State.h
   ${include_directory}/Synchronization.h
   ${include_directory}/Types.h
@@ -104,6 +105,7 @@ set(src_files
   ${source_directory}/Mapping.cpp
   ${source_directory}/Misc.cpp
   ${source_directory}/Parallelism.cpp
+  ${source_directory}/Profiling.cpp
   ${source_directory}/Reduction.cpp
   ${source_directory}/State.cpp
   ${source_directory}/Synchronization.cpp
diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h
new file mode 100644
index 00000000000000..68c7744cd60752
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h
@@ -0,0 +1,21 @@
+//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_PROFILING_H
+#define OMPTARGET_DEVICERTL_PROFILING_H
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr);
+void __llvm_profile_register_names_function(void *ptr, long int i);
+}
+
+#endif
diff --git a/openmp/libomptarget/DeviceRTL/src/Profiling.cpp b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
new file mode 100644
index 00000000000000..799477f5e47d27
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
@@ -0,0 +1,19 @@
+//===------- Profiling.cpp ---------------------------------------- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Profiling.h"
+
+#pragma omp begin declare target device_type(nohost)
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr) {}
+void __llvm_profile_register_names_function(void *ptr, long int i) {}
+}
+
+#pragma omp end declare target

>From fb067d4ffe604fd68cf90b705db1942bce49dbb1 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Sat, 16 Dec 2023 01:18:41 -0600
Subject: [PATCH 2/8] Fix PGO instrumentation for GPU targets

---
 clang/lib/CodeGen/CodeGenPGO.cpp                      | 10 ++++++++--
 .../lib/Transforms/Instrumentation/InstrProfiling.cpp | 11 ++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 81bf8ea696b164..edae6885b528ac 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -959,8 +959,14 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   unsigned Counter = (*RegionCounterMap)[S];
 
-  llvm::Value *Args[] = {FuncNameVar,
-                         Builder.getInt64(FunctionHash),
+  // Make sure that pointer to global is passed in with zero addrspace
+  // This is relevant during GPU profiling
+  auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext());
+  auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty);
+  auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+      FuncNameVar, I8PtrTy);
+
+  llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
                          Builder.getInt32(NumRegionCounters),
                          Builder.getInt32(Counter), StepV};
   if (!StepV)
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe5a0578bd9721..d2cb8155c17967 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1658,10 +1658,13 @@ void InstrLowerer::emitRegistration() {
   IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
   for (Value *Data : CompilerUsedVars)
     if (!isa<Function>(Data))
-      IRB.CreateCall(RuntimeRegisterF, Data);
+      // Check for addrspace cast when profiling GPU
+      IRB.CreateCall(RuntimeRegisterF,
+                     IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
   for (Value *Data : UsedVars)
     if (Data != NamesVar && !isa<Function>(Data))
-      IRB.CreateCall(RuntimeRegisterF, Data);
+      IRB.CreateCall(RuntimeRegisterF,
+                     IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
 
   if (NamesVar) {
     Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
@@ -1670,7 +1673,9 @@ void InstrLowerer::emitRegistration() {
     auto *NamesRegisterF =
         Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
                          getInstrProfNamesRegFuncName(), M);
-    IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
+    IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
+                                        NamesVar, VoidPtrTy),
+                                    IRB.getInt64(NamesSize)});
   }
 
   IRB.CreateRetVoid();

>From 7a0e0efa178cc4de6a22a8f5cc3f53cd1c81ea3a Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Thu, 21 Dec 2023 00:25:46 -0600
Subject: [PATCH 3/8] Change global visibility on GPU targets

---
 llvm/include/llvm/ProfileData/InstrProf.h       |  4 ++++
 llvm/lib/ProfileData/InstrProf.cpp              | 17 +++++++++++++++--
 .../Instrumentation/InstrProfiling.cpp          | 15 +++++++++++----
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 288dc71d756aee..bf9899d867e3dd 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
 /// Return the marker used to separate PGO names during serialization.
 inline StringRef getInstrProfNameSeparator() { return "\01"; }
 
+/// Determines whether module targets a GPU eligable for PGO
+/// instrumentation
+bool isGPUProfTarget(const Module &M);
+
 /// Return the modified name for function \c F suitable to be
 /// used the key for profile lookup. Variable \c InLTO indicates if this
 /// is called in LTO optimization passes.
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 649d814cfd9de0..0d6717aeb0142c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -410,13 +410,22 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
   return VarName;
 }
 
+bool isGPUProfTarget(const Module &M) {
+  const auto &triple = M.getTargetTriple();
+  return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 ||
+         triple.rfind("r600", 0) == 0;
+}
+
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                      GlobalValue::LinkageTypes Linkage,
                                      StringRef PGOFuncName) {
+  // Ensure profiling variables on GPU are visible to be read from host
+  if (isGPUProfTarget(M))
+    Linkage = GlobalValue::ExternalLinkage;
   // We generally want to match the function's linkage, but available_externally
   // and extern_weak both have the wrong semantics, and anything that doesn't
   // need to link across compilation units doesn't need to be visible at all.
-  if (Linkage == GlobalValue::ExternalWeakLinkage)
+  else if (Linkage == GlobalValue::ExternalWeakLinkage)
     Linkage = GlobalValue::LinkOnceAnyLinkage;
   else if (Linkage == GlobalValue::AvailableExternallyLinkage)
     Linkage = GlobalValue::LinkOnceODRLinkage;
@@ -430,8 +439,12 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
       new GlobalVariable(M, Value->getType(), true, Linkage, Value,
                          getPGOFuncNameVarName(PGOFuncName, Linkage));
 
+  // If the target is a GPU, make the symbol protected so it can
+  // be read from the host device
+  if (isGPUProfTarget(M))
+    FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
   // Hide the symbol so that we correctly get a copy for each executable.
-  if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
+  else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
     FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
 
   return FuncNameVar;
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index d2cb8155c17967..3b582b65190808 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1481,6 +1481,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
     Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
 
+  if (isGPUProfTarget(M)) {
+    Linkage = GlobalValue::ExternalLinkage;
+    Visibility = GlobalValue::ProtectedVisibility;
+  }
   // If the data variable is not referenced by code (if we don't emit
   // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
   // data variable live under linker GC, the data variable can be private. This
@@ -1492,9 +1496,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
   // that other copies must have the same CFG and cannot have value profiling.
   // If no hash suffix, other profd copies may be referenced by code.
-  if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
-      (TT.isOSBinFormatELF() ||
-       (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
+  else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+           (TT.isOSBinFormatELF() ||
+            (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
     Linkage = GlobalValue::PrivateLinkage;
     Visibility = GlobalValue::DefaultVisibility;
   }
@@ -1696,7 +1700,10 @@ bool InstrLowerer::emitRuntimeHook() {
   auto *Var =
       new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
                          nullptr, getInstrProfRuntimeHookVarName());
-  Var->setVisibility(GlobalValue::HiddenVisibility);
+  if (isGPUProfTarget(M))
+    Var->setVisibility(GlobalValue::ProtectedVisibility);
+  else
+    Var->setVisibility(GlobalValue::HiddenVisibility);
 
   if (TT.isOSBinFormatELF() && !TT.isPS()) {
     // Mark the user variable as used so that it isn't stripped out.

>From fddc07908ed9aa698fe3250ddbfc5621ab4d049d Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 22 Dec 2023 23:43:29 -0600
Subject: [PATCH 4/8] Make names global public on GPU

---
 llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 3b582b65190808..61fba7be3ee0ee 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1621,6 +1621,13 @@ void InstrLowerer::emitNameData() {
   NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
+
+  // Make names variable public if current target is a GPU
+  if (isGPUProfTarget(M)) {
+    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+    NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
+  }
+
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
   NamesVar->setSection(

>From e9db03c70bf79f4f4ddad4b48a5aa63a37e0d4f6 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Fri, 29 Dec 2023 12:54:50 -0600
Subject: [PATCH 5/8] Read and print GPU device PGO globals

---
 .../common/include/GlobalHandler.h            | 27 ++++++
 .../common/src/GlobalHandler.cpp              | 82 +++++++++++++++++++
 .../common/src/PluginInterface.cpp            | 14 ++++
 3 files changed, 123 insertions(+)

diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index fa079ac9660ee0..a82cd536487653 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -14,9 +14,11 @@
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
 
 #include <string>
+#include <vector>
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ProfileData/InstrProf.h"
 
 #include "Shared/Debug.h"
 #include "Shared/Utils.h"
@@ -58,6 +60,22 @@ class GlobalTy {
   void setPtr(void *P) { Ptr = P; }
 };
 
+typedef void *IntPtrT;
+struct __llvm_profile_data {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+/// PGO profiling data extracted from a GPU device
+struct GPUProfGlobals {
+  std::string names;
+  std::vector<std::vector<int64_t>> counts;
+  std::vector<__llvm_profile_data> data;
+  Triple targetTriple;
+
+  void dump() const;
+};
+
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
 template <typename Ty> class StaticGlobalTy : public GlobalTy {
   Ty Data;
@@ -172,6 +190,15 @@ class GenericGlobalHandlerTy {
     return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
                                           /* D2H */ false);
   }
+
+  /// Checks whether a given image contains profiling globals.
+  bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
+
+  /// Reads profiling data from a GPU image to supplied profdata struct.
+  /// Iterates through the image symbol table and stores global values
+  /// with profiling prefixes.
+  Expected<GPUProfGlobals> readProfilingGlobals(GenericDeviceTy &Device,
+                                                DeviceImageTy &Image);
 };
 
 } // namespace plugin
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 3a272e228c7dfe..5dd5daec468ca5 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -176,3 +176,85 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
 
   return Plugin::success();
 }
+
+bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
+                                                 DeviceImageTy &Image) {
+  GlobalTy global(getInstrProfNamesVarName().str(), 0);
+  if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) {
+    consumeError(std::move(Err));
+    return false;
+  }
+  return true;
+}
+
+Expected<GPUProfGlobals>
+GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
+                                             DeviceImageTy &Image) {
+  GPUProfGlobals profdata;
+  const auto *elf = getOrCreateELFObjectFile(Device, Image);
+  profdata.targetTriple = elf->makeTriple();
+  // Iterate through
+  for (auto &sym : elf->symbols()) {
+    if (auto name = sym.getName()) {
+      // Check if given current global is a profiling global based
+      // on name
+      if (name->equals(getInstrProfNamesVarName())) {
+        // Read in profiled function names
+        std::vector<char> chars(sym.getSize() / sizeof(char), ' ');
+        GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data());
+        if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
+          return Err;
+        std::string names(chars.begin(), chars.end());
+        profdata.names = std::move(names);
+      } else if (name->starts_with(getInstrProfCountersVarPrefix())) {
+        // Read global variable profiling counts
+        std::vector<int64_t> counts(sym.getSize() / sizeof(int64_t), 0);
+        GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data());
+        if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
+          return Err;
+        profdata.counts.push_back(std::move(counts));
+      } else if (name->starts_with(getInstrProfDataVarPrefix())) {
+        // Read profiling data for this global variable
+        __llvm_profile_data data{};
+        GlobalTy DataGlobal(name->str(), sym.getSize(), &data);
+        if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
+          return Err;
+        profdata.data.push_back(std::move(data));
+      }
+    }
+  }
+  return profdata;
+}
+
+void GPUProfGlobals::dump() const {
+  llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str()
+               << "\n";
+
+  llvm::outs() << "======== Counters =========\n";
+  for (const auto &count : counts) {
+    llvm::outs() << "[";
+    for (size_t i = 0; i < count.size(); i++) {
+      if (i == 0)
+        llvm::outs() << " ";
+      llvm::outs() << count[i] << " ";
+    }
+    llvm::outs() << "]\n";
+  }
+
+  llvm::outs() << "========== Data ===========\n";
+  for (const auto &d : data) {
+    llvm::outs() << "{ ";
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer)                     \
+  llvm::outs() << d.Name << " ";
+#include "llvm/ProfileData/InstrProfData.inc"
+    llvm::outs() << " }\n";
+  }
+
+  llvm::outs() << "======== Functions ========\n";
+  InstrProfSymtab symtab;
+  if (Error Err = symtab.create(StringRef(names))) {
+    consumeError(std::move(Err));
+  }
+  symtab.dumpNames(llvm::outs());
+  llvm::outs() << "===========================\n";
+}
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 3c7d1ca8998787..84ed90f03f84f1 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -811,6 +811,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
            DeviceMemoryPoolTracking.AllocationMax);
   }
 
+  for (auto *Image : LoadedImages) {
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    if (!Handler.hasProfilingGlobals(*this, *Image))
+      continue;
+
+    GPUProfGlobals profdata;
+    auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
+    if (!ProfOrErr)
+      return ProfOrErr.takeError();
+
+    // TODO: write data to profiling file
+    ProfOrErr->dump();
+  }
+
   // Delete the memory manager before deinitializing the device. Otherwise,
   // we may delete device allocations after the device is deinitialized.
   if (MemoryManager)

>From e4687605d1a6ca932312025826db09dba84845a3 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:06:15 -0600
Subject: [PATCH 6/8] Fix rebase bug

---
 .../plugins-nextgen/common/src/GlobalHandler.cpp       | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index cb71b61f4a9c4f..86742d0f77a2fe 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -178,10 +178,12 @@ Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
   GPUProfGlobals profdata;
-  const auto *elf = getOrCreateELFObjectFile(Device, Image);
-  profdata.targetTriple = elf->makeTriple();
-  // Iterate through
-  for (auto &sym : elf->symbols()) {
+  auto ELFObj = getELFObjectFile(Image);
+  if (!ELFObj)
+    return ELFObj.takeError();
+  profdata.targetTriple = ELFObj->makeTriple();
+  // Iterate through elf symbols
+  for (auto &sym : ELFObj->symbols()) {
     if (auto name = sym.getName()) {
       // Check if given current global is a profiling global based
       // on name

>From ec18ce94c227e1d43927955fa1c67360ecfcfca6 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:10:19 -0600
Subject: [PATCH 7/8] Refactor portions to be more idiomatic

---
 clang/lib/CodeGen/CodeGenPGO.cpp   | 4 +---
 llvm/lib/ProfileData/InstrProf.cpp | 5 ++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index edae6885b528ac..7bfcec43ee4c98 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -961,10 +961,8 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
-  auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext());
-  auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty);
   auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-      FuncNameVar, I8PtrTy);
+      FuncNameVar, llvm::PointerType::getUnqual(CGM.getLLVMContext()));
 
   llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
                          Builder.getInt32(NumRegionCounters),
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index cdcd6840bb5108..1d88da16a5ff9c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -429,9 +429,8 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
 }
 
 bool isGPUProfTarget(const Module &M) {
-  const auto &triple = M.getTargetTriple();
-  return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 ||
-         triple.rfind("r600", 0) == 0;
+  const auto &Triple = llvm::Triple(M.getTargetTriple());
+  return Triple.isAMDGPU() || Triple.isNVPTX();
 }
 
 GlobalVariable *createPGOFuncNameVar(Module &M,

>From 0872556f597056361b0a2c23cdd0be3d9745aef3 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Wed, 3 Jan 2024 17:18:47 -0600
Subject: [PATCH 8/8] Reformat DeviceRTL prof functions

---
 openmp/libomptarget/DeviceRTL/include/Profiling.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h
index 68c7744cd60752..9efc1554c176bc 100644
--- a/openmp/libomptarget/DeviceRTL/include/Profiling.h
+++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h
@@ -13,9 +13,8 @@
 #define OMPTARGET_DEVICERTL_PROFILING_H
 
 extern "C" {
-
-void __llvm_profile_register_function(void *ptr);
-void __llvm_profile_register_names_function(void *ptr, long int i);
+void __llvm_profile_register_function(void *Ptr);
+void __llvm_profile_register_names_function(void *Ptr, long int I);
 }
 
 #endif



More information about the cfe-commits mailing list