[clang-tools-extra] [llvm] [openmp] [clang] [PGO][OpenMP] Instrumentation for GPU devices (PR #76587)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 29 13:38:31 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-pgo
Author: Ethan Luis McDonough (EthanLuisMcDonough)
<details>
<summary>Changes</summary>
This pull request is the first part of an ongoing effort to extend PGO instrumentation to GPU device code. This PR makes the following changes:
- Adds blank registration functions to device RTL
- Gives PGO globals protected visibility when targeting a supported GPU
- Handles any addrspace casts for PGO calls
- Implements PGO global extraction in GPU plugins (currently only dumps info)
These changes can be tested by supplying `-fprofile-instrument=clang` while targeting a GPU.
---
Full diff: https://github.com/llvm/llvm-project/pull/76587.diff
11 Files Affected:
- (modified) clang/lib/CodeGen/CodeGenPGO.cpp (+8-2)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+3)
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+4)
- (modified) llvm/lib/ProfileData/InstrProf.cpp (+15-2)
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+26-7)
- (modified) openmp/libomptarget/DeviceRTL/CMakeLists.txt (+2)
- (added) openmp/libomptarget/DeviceRTL/include/Profiling.h (+21)
- (added) openmp/libomptarget/DeviceRTL/src/Profiling.cpp (+19)
- (modified) openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h (+27)
- (modified) openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp (+82)
- (modified) openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp (+14)
``````````diff
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 81bf8ea696b164..edae6885b528ac 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -959,8 +959,14 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
unsigned Counter = (*RegionCounterMap)[S];
- llvm::Value *Args[] = {FuncNameVar,
- Builder.getInt64(FunctionHash),
+ // Make sure that pointer to global is passed in with zero addrspace
+ // This is relevant during GPU profiling
+ auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext());
+ auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty);
+ auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ FuncNameVar, I8PtrTy);
+
+ llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash),
Builder.getInt32(NumRegionCounters),
Builder.getInt32(Counter), StepV};
if (!StepV)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00..1d887d5cb58127 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -503,6 +503,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
+__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
+
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 36be2e7d869e7b..32648e4a67ad9e 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -171,6 +171,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
/// Return the marker used to separate PGO names during serialization.
inline StringRef getInstrProfNameSeparator() { return "\01"; }
+/// Determines whether module targets a GPU eligable for PGO
+/// instrumentation
+bool isGPUProfTarget(const Module &M);
+
/// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is
/// for front-end (Clang, etc) instrumentation.
/// Return the modified name for function \c F suitable to be
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 134a400e639c4b..cdcd6840bb5108 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -428,13 +428,22 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
return VarName;
}
+bool isGPUProfTarget(const Module &M) {
+ const auto &triple = M.getTargetTriple();
+ return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 ||
+ triple.rfind("r600", 0) == 0;
+}
+
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef PGOFuncName) {
+ // Ensure profiling variables on GPU are visible to be read from host
+ if (isGPUProfTarget(M))
+ Linkage = GlobalValue::ExternalLinkage;
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
// need to link across compilation units doesn't need to be visible at all.
- if (Linkage == GlobalValue::ExternalWeakLinkage)
+ else if (Linkage == GlobalValue::ExternalWeakLinkage)
Linkage = GlobalValue::LinkOnceAnyLinkage;
else if (Linkage == GlobalValue::AvailableExternallyLinkage)
Linkage = GlobalValue::LinkOnceODRLinkage;
@@ -448,8 +457,12 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
new GlobalVariable(M, Value->getType(), true, Linkage, Value,
getPGOFuncNameVarName(PGOFuncName, Linkage));
+ // If the target is a GPU, make the symbol protected so it can
+ // be read from the host device
+ if (isGPUProfTarget(M))
+ FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
// Hide the symbol so that we correctly get a copy for each executable.
- if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
+ else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
return FuncNameVar;
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe5a0578bd9721..61fba7be3ee0ee 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1481,6 +1481,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
+ if (isGPUProfTarget(M)) {
+ Linkage = GlobalValue::ExternalLinkage;
+ Visibility = GlobalValue::ProtectedVisibility;
+ }
// If the data variable is not referenced by code (if we don't emit
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
// data variable live under linker GC, the data variable can be private. This
@@ -1492,9 +1496,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
// that other copies must have the same CFG and cannot have value profiling.
// If no hash suffix, other profd copies may be referenced by code.
- if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
- (TT.isOSBinFormatELF() ||
- (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
+ else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+ (TT.isOSBinFormatELF() ||
+ (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
Linkage = GlobalValue::PrivateLinkage;
Visibility = GlobalValue::DefaultVisibility;
}
@@ -1617,6 +1621,13 @@ void InstrLowerer::emitNameData() {
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
GlobalValue::PrivateLinkage, NamesVal,
getInstrProfNamesVarName());
+
+ // Make names variable public if current target is a GPU
+ if (isGPUProfTarget(M)) {
+ NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+ NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
+ }
+
NamesSize = CompressedNameStr.size();
setGlobalVariableLargeSection(TT, *NamesVar);
NamesVar->setSection(
@@ -1658,10 +1669,13 @@ void InstrLowerer::emitRegistration() {
IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
for (Value *Data : CompilerUsedVars)
if (!isa<Function>(Data))
- IRB.CreateCall(RuntimeRegisterF, Data);
+ // Check for addrspace cast when profiling GPU
+ IRB.CreateCall(RuntimeRegisterF,
+ IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
for (Value *Data : UsedVars)
if (Data != NamesVar && !isa<Function>(Data))
- IRB.CreateCall(RuntimeRegisterF, Data);
+ IRB.CreateCall(RuntimeRegisterF,
+ IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
if (NamesVar) {
Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
@@ -1670,7 +1684,9 @@ void InstrLowerer::emitRegistration() {
auto *NamesRegisterF =
Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
getInstrProfNamesRegFuncName(), M);
- IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
+ IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
+ NamesVar, VoidPtrTy),
+ IRB.getInt64(NamesSize)});
}
IRB.CreateRetVoid();
@@ -1691,7 +1707,10 @@ bool InstrLowerer::emitRuntimeHook() {
auto *Var =
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
nullptr, getInstrProfRuntimeHookVarName());
- Var->setVisibility(GlobalValue::HiddenVisibility);
+ if (isGPUProfTarget(M))
+ Var->setVisibility(GlobalValue::ProtectedVisibility);
+ else
+ Var->setVisibility(GlobalValue::HiddenVisibility);
if (TT.isOSBinFormatELF() && !TT.isPS()) {
// Mark the user variable as used so that it isn't stripped out.
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e40a80ab..55ee15d068c67b 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -89,6 +89,7 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
+ ${include_directory}/Profiling.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
@@ -104,6 +105,7 @@ set(src_files
${source_directory}/Mapping.cpp
${source_directory}/Misc.cpp
${source_directory}/Parallelism.cpp
+ ${source_directory}/Profiling.cpp
${source_directory}/Reduction.cpp
${source_directory}/State.cpp
${source_directory}/Synchronization.cpp
diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h
new file mode 100644
index 00000000000000..68c7744cd60752
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h
@@ -0,0 +1,21 @@
+//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_PROFILING_H
+#define OMPTARGET_DEVICERTL_PROFILING_H
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr);
+void __llvm_profile_register_names_function(void *ptr, long int i);
+}
+
+#endif
diff --git a/openmp/libomptarget/DeviceRTL/src/Profiling.cpp b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
new file mode 100644
index 00000000000000..799477f5e47d27
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp
@@ -0,0 +1,19 @@
+//===------- Profiling.cpp ---------------------------------------- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Profiling.h"
+
+#pragma omp begin declare target device_type(nohost)
+
+extern "C" {
+
+void __llvm_profile_register_function(void *ptr) {}
+void __llvm_profile_register_names_function(void *ptr, long int i) {}
+}
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index d9fe938790ca76..a803b3f76d8b25 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -14,9 +14,11 @@
#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
#include <string>
+#include <vector>
#include "llvm/ADT/DenseMap.h"
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "Shared/Debug.h"
#include "Shared/Utils.h"
@@ -58,6 +60,22 @@ class GlobalTy {
void setPtr(void *P) { Ptr = P; }
};
+typedef void *IntPtrT;
+struct __llvm_profile_data {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+/// PGO profiling data extracted from a GPU device
+struct GPUProfGlobals {
+ std::string names;
+ std::vector<std::vector<int64_t>> counts;
+ std::vector<__llvm_profile_data> data;
+ Triple targetTriple;
+
+ void dump() const;
+};
+
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
template <typename Ty> class StaticGlobalTy : public GlobalTy {
Ty Data;
@@ -167,6 +185,15 @@ class GenericGlobalHandlerTy {
return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
/* D2H */ false);
}
+
+ /// Checks whether a given image contains profiling globals.
+ bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
+
+ /// Reads profiling data from a GPU image to supplied profdata struct.
+ /// Iterates through the image symbol table and stores global values
+ /// with profiling prefixes.
+ Expected<GPUProfGlobals> readProfilingGlobals(GenericDeviceTy &Device,
+ DeviceImageTy &Image);
};
} // namespace plugin
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index d398f60c55bd13..cb71b61f4a9c4f 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -163,3 +163,85 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
return Plugin::success();
}
+
+bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
+ DeviceImageTy &Image) {
+ GlobalTy global(getInstrProfNamesVarName().str(), 0);
+ if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) {
+ consumeError(std::move(Err));
+ return false;
+ }
+ return true;
+}
+
+Expected<GPUProfGlobals>
+GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
+ DeviceImageTy &Image) {
+ GPUProfGlobals profdata;
+ const auto *elf = getOrCreateELFObjectFile(Device, Image);
+ profdata.targetTriple = elf->makeTriple();
+ // Iterate through
+ for (auto &sym : elf->symbols()) {
+ if (auto name = sym.getName()) {
+ // Check if given current global is a profiling global based
+ // on name
+ if (name->equals(getInstrProfNamesVarName())) {
+ // Read in profiled function names
+ std::vector<char> chars(sym.getSize() / sizeof(char), ' ');
+ GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data());
+ if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
+ return Err;
+ std::string names(chars.begin(), chars.end());
+ profdata.names = std::move(names);
+ } else if (name->starts_with(getInstrProfCountersVarPrefix())) {
+ // Read global variable profiling counts
+ std::vector<int64_t> counts(sym.getSize() / sizeof(int64_t), 0);
+ GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data());
+ if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
+ return Err;
+ profdata.counts.push_back(std::move(counts));
+ } else if (name->starts_with(getInstrProfDataVarPrefix())) {
+ // Read profiling data for this global variable
+ __llvm_profile_data data{};
+ GlobalTy DataGlobal(name->str(), sym.getSize(), &data);
+ if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
+ return Err;
+ profdata.data.push_back(std::move(data));
+ }
+ }
+ }
+ return profdata;
+}
+
+void GPUProfGlobals::dump() const {
+ llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str()
+ << "\n";
+
+ llvm::outs() << "======== Counters =========\n";
+ for (const auto &count : counts) {
+ llvm::outs() << "[";
+ for (size_t i = 0; i < count.size(); i++) {
+ if (i == 0)
+ llvm::outs() << " ";
+ llvm::outs() << count[i] << " ";
+ }
+ llvm::outs() << "]\n";
+ }
+
+ llvm::outs() << "========== Data ===========\n";
+ for (const auto &d : data) {
+ llvm::outs() << "{ ";
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \
+ llvm::outs() << d.Name << " ";
+#include "llvm/ProfileData/InstrProfData.inc"
+ llvm::outs() << " }\n";
+ }
+
+ llvm::outs() << "======== Functions ========\n";
+ InstrProfSymtab symtab;
+ if (Error Err = symtab.create(StringRef(names))) {
+ consumeError(std::move(Err));
+ }
+ symtab.dumpNames(llvm::outs());
+ llvm::outs() << "===========================\n";
+}
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 178c60a77ab51f..3d218570a49445 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -817,6 +817,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
DeviceMemoryPoolTracking.AllocationMax);
}
+ for (auto *Image : LoadedImages) {
+ GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+ if (!Handler.hasProfilingGlobals(*this, *Image))
+ continue;
+
+ GPUProfGlobals profdata;
+ auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
+ if (!ProfOrErr)
+ return ProfOrErr.takeError();
+
+ // TODO: write data to profiling file
+ ProfOrErr->dump();
+ }
+
// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
``````````
</details>
https://github.com/llvm/llvm-project/pull/76587
More information about the cfe-commits
mailing list