[Openmp-commits] [openmp] b82ac74 - [OpenMP][AMDGPU] More detail in AMDGPU kernel launch info

JP Lehr via Openmp-commits openmp-commits at lists.llvm.org
Tue Feb 28 04:42:20 PST 2023


Author: JP Lehr
Date: 2023-02-28T07:41:48-05:00
New Revision: b82ac74f7e184319c4f379234915efc8d096f048

URL: https://github.com/llvm/llvm-project/commit/b82ac74f7e184319c4f379234915efc8d096f048
DIFF: https://github.com/llvm/llvm-project/commit/b82ac74f7e184319c4f379234915efc8d096f048.diff

LOG: [OpenMP][AMDGPU] More detail in AMDGPU kernel launch info

Makes the info that is printed for kernel launches configurable for
different plugins. Adds all machinery to print the detailed launch
info that the current AMD plugin provides and includes e.g. register
spill counts.

The files msgpack.cpp, msgpack.def, and msgpack.h are copied from the old plugin
and are untouched. The contents of UtilitiesHSA.cpp and .h are copied together from
various files from the old plugin. The code was originally written by
Jon Chesterfield. I updated the function and type names visible to the outside, i.e.
in headers, to respect the LLVM conventions.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D144521

Added: 
    

Modified: 
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/test/offloading/info.c

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index c28081c439345..40c616c249fdd 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -373,10 +373,22 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   Expected<hsa_executable_symbol_t>
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
 
+  /// Get additional info for kernel, e.g., register spill counts
+  std::optional<utils::KernelMetaDataTy>
+  getKernelInfo(StringRef Identifier) const {
+    auto It = KernelInfoMap.find(Identifier);
+
+    if (It == KernelInfoMap.end())
+      return {};
+
+    return It->second;
+  }
+
 private:
   /// The exectuable loaded on the agent.
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
+  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -426,6 +438,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.
 
+    // Get additional kernel info read from image
+    KernelInfo = AMDImage.getKernelInfo(getName());
+    if (!KernelInfo.has_value())
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
+           "Could not read extra information for kernel %s.", getName());
+
     return Plugin::success();
   }
 
@@ -434,6 +452,11 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Print more elaborate kernel launch info for AMDGPU
+  Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
+                               KernelArgsTy &KernelArgs, uint32_t NumThreads,
+                               uint64_t NumBlocks) const override;
+
   /// The default number of blocks is common to the whole device.
   uint32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override {
     return GenericDevice.getDefaultNumBlocks();
@@ -462,6 +485,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
 
   /// The size of implicit kernel arguments.
   const uint32_t ImplicitArgsSize;
+
+  /// Additional Info for the AMD GPU Kernel
+  std::optional<utils::KernelMetaDataTy> KernelInfo;
 };
 
 /// Class representing an HSA signal. Signals are used to define dependencies
@@ -2200,6 +2226,10 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
   if (Result)
     return Plugin::error("Loaded HSA executable does not validate");
 
+  if (auto Err =
+          utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
+    return Err;
+
   return Plugin::success();
 }
 
@@ -2571,6 +2601,50 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                  GroupSize, ArgsMemoryManager);
 }
 
+Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
+                                             KernelArgsTy &KernelArgs,
+                                             uint32_t NumThreads,
+                                             uint64_t NumBlocks) const {
+  // Only do all this when the output is requested
+  if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
+    return Plugin::success();
+
+  // We don't have data to print additional info, but no hard error
+  if (!KernelInfo.has_value())
+    return Plugin::success();
+
+  // General Info
+  auto ConstWGSize = getDefaultNumThreads(GenericDevice);
+  auto NumGroups = NumBlocks;
+  auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice);
+  auto NumTeams = KernelArgs.NumTeams[0];       // Only first dimension
+  auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension
+
+  // Kernel Arguments Info
+  auto ArgNum = KernelArgs.NumArgs;
+  auto LoopTripCount = KernelArgs.Tripcount;
+
+  // Details for AMDGPU kernels
+  auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
+  auto SGPRCount = (*KernelInfo).SGPRCount;
+  auto VGPRCount = (*KernelInfo).VGPRCount;
+  auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
+  auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
+
+  // TODO set correctly once host services available
+  auto HostCallRequired = false;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
+       "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4dX%4d) "
+       "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
+       "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n",
+       getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup,
+       NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount,
+       SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired,
+       getName());
+
+  return Plugin::success();
+}
+
 GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); }
 
 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {

diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index d3f299dc1c6fe..1c549189b5c4d 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -17,6 +17,14 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/Support/MemoryBufferRef.h"
+
+#include "llvm/Support/YAMLTraits.h"
 
 namespace llvm {
 namespace omp {
@@ -127,6 +135,166 @@ bool isImageCompatibleWithEnv(const __tgt_image_info *Info,
   return true;
 }
 
+struct KernelMetaDataTy {
+  uint64_t KernelObject;
+  uint32_t GroupSegmentList;
+  uint32_t PrivateSegmentSize;
+  uint32_t SGPRCount;
+  uint32_t VGPRCount;
+  uint32_t SGPRSpillCount;
+  uint32_t VGPRSpillCount;
+  uint32_t KernelSegmentSize;
+  uint32_t ExplicitArgumentCount;
+  uint32_t ImplicitArgumentCount;
+};
+namespace {
+
+/// Reads the AMDGPU specific per-kernel-metadata from an image.
+class KernelInfoReader {
+public:
+  KernelInfoReader(StringMap<KernelMetaDataTy> &KIM) : KernelInfoMap(KIM) {}
+
+  /// Process ELF note to read AMDGPU metadata from respective information
+  /// fields.
+  Error processNote(const object::ELF64LE::Note &Note) {
+    if (Note.getName() != "AMDGPU")
+      return Error::success(); // We are not interested in other things
+
+    assert(Note.getType() == ELF::NT_AMDGPU_METADATA &&
+           "Parse AMDGPU MetaData");
+    auto Desc = Note.getDesc();
+    StringRef MsgPackString =
+        StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
+    msgpack::Document MsgPackDoc;
+    if (!MsgPackDoc.readFromBlob(MsgPackString, /*Multi=*/false))
+      return Error::success();
+
+    AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
+    if (!Verifier.verify(MsgPackDoc.getRoot()))
+      return Error::success();
+
+    auto RootMap = MsgPackDoc.getRoot().getMap(true);
+
+    if (auto Err = iterateAMDKernels(RootMap))
+      return Err;
+
+    return Error::success();
+  }
+
+private:
+  /// Extracts the relevant information via simple string look-up in the msgpack
+  /// document elements.
+  Error extractKernelData(msgpack::MapDocNode::MapTy::value_type V,
+                          std::string &KernelName,
+                          KernelMetaDataTy &KernelData) {
+    if (!V.first.isString())
+      return Error::success();
+
+    const auto isKey = [](const msgpack::DocNode &DK, StringRef SK) {
+      return DK.getString() == SK;
+    };
+
+    if (isKey(V.first, ".name")) {
+      KernelName = V.second.toString();
+    } else if (isKey(V.first, ".sgpr_count")) {
+      KernelData.SGPRCount = V.second.getUInt();
+    } else if (isKey(V.first, ".sgpr_spill_count")) {
+      KernelData.SGPRSpillCount = V.second.getUInt();
+    } else if (isKey(V.first, ".vgpr_count")) {
+      KernelData.VGPRCount = V.second.getUInt();
+    } else if (isKey(V.first, ".vgpr_spill_count")) {
+      KernelData.VGPRSpillCount = V.second.getUInt();
+    } else if (isKey(V.first, ".private_segment_fixed_size")) {
+      KernelData.PrivateSegmentSize = V.second.getUInt();
+    } else if (isKey(V.first, ".group_segement_fixed_size")) {
+      KernelData.GroupSegmentList = V.second.getUInt();
+    }
+
+    return Error::success();
+  }
+
+  /// Get the "amdhsa.kernels" element from the msgpack Document
+  Expected<msgpack::ArrayDocNode> getAMDKernelsArray(msgpack::MapDocNode &MDN) {
+    auto Res = MDN.find("amdhsa.kernels");
+    if (Res == MDN.end())
+      return createStringError(inconvertibleErrorCode(),
+                               "Could not find amdhsa.kernels key");
+
+    auto Pair = *Res;
+    assert(Pair.second.isArray() &&
+           "AMDGPU kernel entries are arrays of entries");
+
+    return Pair.second.getArray();
+  }
+
+  /// Iterate all entries for one "amdhsa.kernels" entry. Each entry is a
+  /// MapDocNode that either maps a string to a single value (most of them) or
+  /// to another array of things. Currently, we only handle the case that maps
+  /// to scalar value.
+  Error generateKernelInfo(msgpack::ArrayDocNode::ArrayTy::iterator It) {
+    KernelMetaDataTy KernelData;
+    std::string KernelName;
+    auto Entry = (*It).getMap();
+    for (auto MI = Entry.begin(), E = Entry.end(); MI != E; ++MI)
+      if (auto Err = extractKernelData(*MI, KernelName, KernelData))
+        return Err;
+
+    KernelInfoMap.insert({KernelName, KernelData});
+    return Error::success();
+  }
+
+  /// Go over the list of AMD kernels in the "amdhsa.kernels" entry
+  Error iterateAMDKernels(msgpack::MapDocNode &MDN) {
+    auto KernelsOrErr = getAMDKernelsArray(MDN);
+    if (auto Err = KernelsOrErr.takeError())
+      return Err;
+
+    auto KernelsArr = *KernelsOrErr;
+    for (auto It = KernelsArr.begin(), E = KernelsArr.end(); It != E; ++It) {
+      if (!It->isMap())
+        continue; // we expect <key,value> pairs
+
+      // Obtain the value for the 
diff erent entries. Each array entry is a
+      // MapDocNode
+      if (auto Err = generateKernelInfo(It))
+        return Err;
+    }
+    return Error::success();
+  }
+
+  // Kernel names are the keys
+  StringMap<KernelMetaDataTy> &KernelInfoMap;
+};
+} // namespace
+
+/// Reads the AMDGPU specific metadata from the ELF file and propagates the
+/// KernelInfoMap
+Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
+                                  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+  Error Err = Error::success(); // Used later as out-parameter
+
+  auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
+  if (auto Err = ELFOrError.takeError())
+    return Err;
+
+  const object::ELF64LEFile ELFObj = ELFOrError.get();
+  ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
+  KernelInfoReader Reader(KernelInfoMap);
+  for (const auto &S : Sections) {
+    if (S.sh_type != ELF::SHT_NOTE)
+      continue;
+
+    for (const auto N : ELFObj.notes(S, Err)) {
+      if (Err)
+        return Err;
+      // Fills the KernelInfoTabel entries in the reader
+      if ((Err = Reader.processNote(N)))
+        return Err;
+    }
+  }
+
+  return Error::success();
+}
 } // namespace utils
 } // namespace plugin
 } // namespace target

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 43ef2c822713f..c8c4a60d0bdf0 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -218,6 +218,25 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
   return initImpl(GenericDevice, Image);
 }
 
+Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
+                                       KernelArgsTy &KernelArgs,
+                                       uint32_t NumThreads,
+                                       uint64_t NumBlocks) const {
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
+       "Launching kernel %s with %" PRIu64
+       " blocks and %d threads in %s mode\n",
+       getName(), NumBlocks, NumThreads, getExecutionModeName());
+  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
+                                NumBlocks);
+}
+
+Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
+                                              KernelArgsTy &KernelArgs,
+                                              uint32_t NumThreads,
+                                              uint64_t NumBlocks) const {
+  return Plugin::success();
+}
+
 Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                               ptr
diff _t *ArgOffsets, KernelArgsTy &KernelArgs,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -232,10 +251,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   uint64_t NumBlocks = getNumBlocks(GenericDevice, KernelArgs.NumTeams,
                                     KernelArgs.Tripcount, NumThreads);
 
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
-       "Launching kernel %s with %" PRIu64
-       " blocks and %d threads in %s mode\n",
-       getName(), NumBlocks, NumThreads, getExecutionModeName());
+  if (auto Err =
+          printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
+    return Err;
 
   return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
                     KernelArgsPtr, AsyncInfoWrapper);

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 274f632df9a99..4cc97485054c5 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -196,6 +196,32 @@ struct GenericKernelTy {
     return false;
   }
 
+protected:
+  /// Get the execution mode name of the kernel.
+  const char *getExecutionModeName() const {
+    switch (ExecutionMode) {
+    case OMP_TGT_EXEC_MODE_SPMD:
+      return "SPMD";
+    case OMP_TGT_EXEC_MODE_GENERIC:
+      return "Generic";
+    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+      return "Generic-SPMD";
+    }
+    llvm_unreachable("Unknown execution mode!");
+  }
+
+  /// Prints generic kernel launch information.
+  Error printLaunchInfo(GenericDeviceTy &GenericDevice,
+                        KernelArgsTy &KernelArgs, uint32_t NumThreads,
+                        uint64_t NumBlocks) const;
+
+  /// Prints plugin-specific kernel launch information after generic kernel
+  /// launch information
+  virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
+                                       KernelArgsTy &KernelArgs,
+                                       uint32_t NumThreads,
+                                       uint64_t NumBlocks) const;
+
 private:
   /// Prepare the arguments before launching the kernel.
   void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
@@ -225,19 +251,6 @@ struct GenericKernelTy {
   }
   bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }
 
-  /// Get the execution mode name of the kernel.
-  const char *getExecutionModeName() const {
-    switch (ExecutionMode) {
-    case OMP_TGT_EXEC_MODE_SPMD:
-      return "SPMD";
-    case OMP_TGT_EXEC_MODE_GENERIC:
-      return "Generic";
-    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
-      return "Generic-SPMD";
-    }
-    llvm_unreachable("Unknown execution mode!");
-  }
-
   /// The kernel name.
   const char *Name;
 

diff  --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c
index fa8f292a7303d..f24727400bcdb 100644
--- a/openmp/libomptarget/test/offloading/info.c
+++ b/openmp/libomptarget/test/offloading/info.c
@@ -1,7 +1,9 @@
 // RUN: %libomptarget-compile-generic \
 // RUN:     -gline-tables-only -fopenmp-extensions
 // RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
-// RUN:   %fcheck-generic -allow-empty -check-prefix=INFO
+// RUN:   %fcheck-generic -allow-empty -check-prefixes=INFO
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 | \
+// RUN:   %fcheck-amdgcn-amd-amdhsa -allow-empty -check-prefixes=INFO,AMDGPU
 
 #include <omp.h>
 #include <stdio.h>
@@ -37,6 +39,7 @@ int main() {
 // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments:
 // INFO: info: firstprivate(val)[4]
 // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode
+// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{   [0-9]+}}X {{[0-9]+}}) reqd:(   {{[0-9]+}}X   {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 n:__omp_offloading_{{.*}}main{{.*}}
 // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
 // INFO: info: Host Ptr           Target Ptr         Size (B) DynRefCount HoldRefCount Declaration
 // INFO: info: {{.*}}             {{.*}}             256      1           0            C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}


        


More information about the Openmp-commits mailing list