[Openmp-commits] [openmp] 94c772d - [OpenMP] Support kernel record and replay

Giorgis Georgakoudis via Openmp-commits openmp-commits at lists.llvm.org
Tue Jan 17 16:29:09 PST 2023


Author: Giorgis Georgakoudis
Date: 2023-01-17T16:29:03-08:00
New Revision: 94c772dc923a63abc744c011db51cecfe80cf093

URL: https://github.com/llvm/llvm-project/commit/94c772dc923a63abc744c011db51cecfe80cf093
DIFF: https://github.com/llvm/llvm-project/commit/94c772dc923a63abc744c011db51cecfe80cf093.diff

LOG: [OpenMP] Support kernel record and replay

This patch adds functionality for recording and replaying the execution of OpenMP offload kernels, based on an original implementation by Steve Rangel. The patch extends libomptarget to extract a json description of the kernel, the device image binary, and a device memory snapshot before and after the execution of a recorded kernel. Kernel recording/replaying in libomptarget is controlled through env vars (LIBOMPTARGET_RECORD, LIBOMPTARGET_REPLAY). It provides a tool, llvm-omp-kernel-replay, for replaying a kernel using the extracted information with the ability to verify replayed execution using the post-execution device memory snapshot, also supporting changing the number of teams/threads for replaying.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D138931

Added: 
    openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
    openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp

Modified: 
    openmp/libomptarget/include/omptarget.h
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/src/exports
    openmp/libomptarget/src/interface.cpp
    openmp/libomptarget/src/omptarget.cpp
    openmp/libomptarget/src/private.h
    openmp/libomptarget/tools/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 93f680010caf2..76b686794e22e 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -416,6 +416,14 @@ int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 // data.
 void __tgt_target_nowait_query(void **AsyncHandle);
 
+/// Executes a target kernel by replaying recorded kernel arguments and
+/// device memory.
+int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
+                               void *DeviceMemory, int64_t DeviceMemorySize,
+                               void **TgtArgs, ptrdiff_t *TgtOffsets,
+                               int32_t NumArgs, int32_t NumTeams,
+                               int32_t ThreadLimit, uint64_t LoopTripCount);
+
 void __tgt_set_info_flag(uint32_t);
 
 int __tgt_print_device_info(int64_t DeviceId);

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 5d160ad4d27fa..d20fbf644ffed 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -18,6 +18,8 @@
 
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
 
 #include <cstdint>
 #include <limits>
@@ -29,6 +31,168 @@ using namespace plugin;
 
 GenericPluginTy *Plugin::SpecificPlugin = nullptr;
 
+// TODO: Fix any thread safety issues for multi-threaded kernel recording.
+struct RecordReplayTy {
+private:
+  // Memory pointers for recording, replaying memory.
+  void *MemoryStart;
+  void *MemoryPtr;
+  size_t MemorySize;
+  GenericDeviceTy *Device;
+  std::mutex AllocationLock;
+
+  // Environment variables for record and replay.
+  // Enables recording kernels if set.
+  BoolEnvar OMPX_RecordKernel;
+  // Enables replaying a kernel if set.
+  BoolEnvar OMPX_ReplayKernel;
+  // Enables saving the device memory kernel output post execution if set.
+  BoolEnvar OMPX_ReplaySaveOutput;
+  // Sets the maximum to pre-allocate device memory.
+  UInt32Envar OMPX_DeviceMemorySize;
+
+  // Record/replay pre-allocates the largest possible device memory using the
+  // default kind.
+  // TODO: Expand allocation to include other kinds (device, host, shared) and
+  // possibly use a MemoryManager to track (de-)allocations for
+  // storing/retrieving when recording/replaying.
+  Error preallocateDeviceMemory() {
+    // Pre-allocate memory on device. Starts with 64GB and subtracts in steps
+    // of 1GB until allocation succeeds.
+    const size_t MAX_MEMORY_ALLOCATION =
+        OMPX_DeviceMemorySize * 1024 * 1024 * 1024ULL;
+    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
+    MemoryStart = nullptr;
+    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
+      MemoryStart =
+          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+      if (MemoryStart)
+        break;
+    }
+
+    if (!MemoryStart)
+      return Plugin::error("Allocating record/replay memory");
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+
+    return Plugin::success();
+  }
+
+  void dumpDeviceMemory(StringRef Filename,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
+    if (!DeviceMemoryMB)
+      report_fatal_error("Error creating MemoryBuffer for device memory");
+
+    auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
+                                    MemoryStart, MemorySize, AsyncInfoWrapper);
+    if (Err)
+      report_fatal_error("Error retrieving data for target pointer");
+
+    StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
+    std::error_code EC;
+    raw_fd_ostream OS(Filename, EC);
+    if (EC)
+      report_fatal_error("Error dumping memory to file " + Filename + " :" +
+                         EC.message());
+    OS << DeviceMemory;
+    OS.close();
+  }
+
+public:
+  bool isRecording() const { return OMPX_RecordKernel; }
+  bool isReplaying() const { return OMPX_ReplayKernel; }
+  bool isRecordingOrReplaying() const {
+    return (OMPX_RecordKernel || OMPX_ReplayKernel);
+  }
+  bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
+
+  RecordReplayTy()
+      : OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
+        OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
+        OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
+        OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
+                              /* Default in GB */ 64) {}
+
+  void saveImage(const char *Name, DeviceImageTy &Image) {
+    Twine ImageName = Twine(Name) + Twine(".image");
+    std::error_code EC;
+    raw_fd_ostream OS(ImageName.str(), EC);
+    if (EC)
+      report_fatal_error("Error saving image : " + StringRef(EC.message()));
+    OS << Image.getMemoryBuffer().getBuffer();
+    OS.close();
+  }
+
+  void saveKernelInputInfo(const char *Name, void **ArgPtrs,
+                           ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                           uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+                           uint64_t LoopTripCount,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    json::Object JsonKernelInfo;
+    JsonKernelInfo["Name"] = Name;
+    JsonKernelInfo["NumArgs"] = NumArgs;
+    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
+    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
+    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
+    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
+    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+
+    json::Array JsonArgPtrs;
+    for (int I = 0; I < NumArgs; ++I)
+      JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
+    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
+
+    json::Array JsonArgOffsets;
+    for (int I = 0; I < NumArgs; ++I)
+      JsonArgOffsets.push_back(ArgOffsets[I]);
+    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
+
+    Twine KernelName(Name);
+    Twine MemoryFilename = KernelName + ".memory";
+    dumpDeviceMemory(MemoryFilename.str(), AsyncInfoWrapper);
+
+    Twine JsonFilename = KernelName + ".json";
+    std::error_code EC;
+    raw_fd_ostream JsonOS(JsonFilename.str(), EC);
+    if (EC)
+      report_fatal_error("Error saving kernel json file : " +
+                         StringRef(EC.message()));
+    JsonOS << json::Value(std::move(JsonKernelInfo));
+    JsonOS.close();
+  }
+
+  void saveKernelOutputInfo(const char *Name,
+                            AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    Twine OutputFilename =
+        Twine(Name) + (isRecording() ? ".original.output" : ".replay.output");
+    dumpDeviceMemory(OutputFilename.str(), AsyncInfoWrapper);
+  }
+
+  void *alloc(uint64_t Size) {
+    assert(MemoryStart && "Expected memory has been pre-allocated");
+    void *Alloc = nullptr;
+    constexpr int ALIGN = 16;
+    // Assumes alignment is a power of 2.
+    int64_t AlignedSize = Size + (ALIGN - 1) & (~(ALIGN - 1));
+    std::lock_guard<std::mutex> LG(AllocationLock);
+    Alloc = MemoryPtr;
+    MemoryPtr = (char *)MemoryPtr + AlignedSize;
+    MemorySize += AlignedSize;
+    return Alloc;
+  }
+
+  Error init(GenericDeviceTy *Device) {
+    this->Device = Device;
+    return preallocateDeviceMemory();
+  }
+
+  void deinit() { Device->free(MemoryStart); }
+
+} RecordReplay;
+
 AsyncInfoWrapperTy::~AsyncInfoWrapperTy() {
   // If we used a local async info object we want synchronous behavior.
   // In that case, and assuming the current status code is OK, we will
@@ -45,6 +209,9 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
   DynamicMemorySize = GenericDevice.getDynamicMemorySize();
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveImage(Name, Image);
+
   return initImpl(GenericDevice, Image);
 }
 
@@ -197,6 +364,10 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
   if (EnableMM)
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
 
+  if (RecordReplay.isRecordingOrReplaying())
+    if (auto Err = RecordReplay.init(this))
+      return Err;
+
   return Plugin::success();
 }
 
@@ -207,6 +378,9 @@ Error GenericDeviceTy::deinit() {
     delete MemoryManager;
   MemoryManager = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    RecordReplay.deinit();
+
   return deinitImpl();
 }
 
@@ -437,6 +611,9 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                             TargetAllocTy Kind) {
   void *Alloc = nullptr;
 
+  if (RecordReplay.isRecordingOrReplaying())
+    return RecordReplay.alloc(Size);
+
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
@@ -469,6 +646,10 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
 }
 
 Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+  // Free is a noop when recording or replaying.
+  if (RecordReplay.isRecordingOrReplaying())
+    return Plugin::success();
+
   int Res;
   if (MemoryManager)
     Res = MemoryManager->free(TgtPtr);
@@ -521,9 +702,20 @@ Error GenericDeviceTy::runTargetTeamRegion(
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
+  if (RecordReplay.isRecording())
+    RecordReplay.saveKernelInputInfo(
+        GenericKernel.getName(), ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
+        ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
   Err =
       GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
                            ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+
+  if (RecordReplay.isRecordingOrReplaying() &&
+      RecordReplay.isSaveOutputEnabled())
+    RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
+                                      AsyncInfoWrapper);
+
   return Err;
 }
 

diff  --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index 5fa013dcbb839..42682abf7786c 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -27,6 +27,7 @@ VERS1.0 {
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_kernel_replay;
     __tgt_mapper_num_components;
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;

diff  --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index bee3d5be9a8e5..3871e6828a57e 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -265,6 +265,48 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                    HostPtr, Args);
 }
 
+/// Implements a target kernel entry that replays a pre-recorded kernel.
+/// \param Loc Source location associated with this target region (unused).
+/// \param DeviceId The device identifier to execute the target region.
+/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
+/// \param DeviceMemory A pointer to an array storing device memory data to move
+///                     prior to kernel execution.
+/// \param DeviceMemorySize The size of the above device memory data in bytes.
+/// \param TgtArgs An array of pointers of the pre-recorded target kernel
+///                arguments.
+/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
+///                   argument offsets.
+/// \param NumArgs The number of kernel arguments.
+/// \param NumTeams Number of teams to launch the target region with.
+/// \param ThreadLimit Limit to the number of threads to use in kernel
+///                    execution.
+/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
+/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
+EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
+                                      void *HostPtr, void *DeviceMemory,
+                                      int64_t DeviceMemorySize, void **TgtArgs,
+                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                                      int32_t NumTeams, int32_t ThreadLimit,
+                                      uint64_t LoopTripCount) {
+
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  AsyncInfoTy AsyncInfo(Device);
+  int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize,
+                         TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
+                         LoopTripCount, AsyncInfo);
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo.synchronize();
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+  assert(Rc == OFFLOAD_SUCCESS &&
+         "__tgt_target_kernel_replay unexpected failure!");
+  return OMP_TGT_SUCCESS;
+}
+
 EXTERN int __tgt_target_kernel_nowait(
     ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit,
     void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList,

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3476e2d4a2e8f..27eca027d96c5 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -1714,3 +1714,53 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
 
   return OFFLOAD_SUCCESS;
 }
+
+/// Executes a kernel using pre-recorded information for loading to
+/// device memory to launch the target kernel with the pre-recorded
+/// configuration.
+int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
+                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+                  int32_t ThreadLimit, uint64_t LoopTripCount,
+                  AsyncInfoTy &AsyncInfo) {
+  int32_t DeviceId = Device.DeviceID;
+  TableMap *TM = getTableMap(HostPtr);
+  // Fail if the table map fails to find the target kernel pointer for the
+  // provided host pointer.
+  if (!TM) {
+    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+           DPxPTR(HostPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  // Retrieve the target table of offloading entries.
+  __tgt_target_table *TargetTable = nullptr;
+  {
+    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
+    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
+           "Not expecting a device ID outside the table's bounds!");
+    TargetTable = TM->Table->TargetsTable[DeviceId];
+  }
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Retrieve the target kernel pointer, allocate and store the recorded device
+  // memory data, and launch device execution.
+  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
+
+  void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr,
+                                  TARGET_ALLOC_DEFAULT);
+  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
+
+  int Ret =
+      Device.runTeamRegion(TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams,
+                           ThreadLimit, LoopTripCount, AsyncInfo);
+
+  if (Ret != OFFLOAD_SUCCESS) {
+    REPORT("Executing target region abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}

diff  --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 6fc47f8d137f8..521f39f34f18c 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -45,6 +45,12 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
                   uint64_t Tripcount, int IsTeamConstruct,
                   AsyncInfoTy &AsyncInfo);
 
+extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
+                         void *DeviceMemory, int64_t DeviceMemorySize,
+                         void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+                         int32_t NumTeams, int32_t ThreadLimit,
+                         uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
+
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
 extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,

diff  --git a/openmp/libomptarget/tools/CMakeLists.txt b/openmp/libomptarget/tools/CMakeLists.txt
index 9237035c46580..a850647fbd58e 100644
--- a/openmp/libomptarget/tools/CMakeLists.txt
+++ b/openmp/libomptarget/tools/CMakeLists.txt
@@ -25,3 +25,4 @@ macro(add_openmp_tool_symlink name)
 endmacro()
 
 add_subdirectory(deviceinfo)
+add_subdirectory(kernelreplay)

diff  --git a/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
new file mode 100644
index 0000000000000..6f3dc33d6736a
--- /dev/null
+++ b/openmp/libomptarget/tools/kernelreplay/CMakeLists.txt
@@ -0,0 +1,26 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build llvm-omp-kernel-replay tool
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building the llvm-omp-kernel-replay tool")
+
+add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp)
+
+llvm_update_compile_flags(llvm-omp-kernel-replay)
+
+target_include_directories(llvm-omp-kernel-replay PRIVATE
+  ${LIBOMPTARGET_INCLUDE_DIR}
+)
+target_link_libraries(llvm-omp-kernel-replay PRIVATE
+  LLVMSupport
+  omp
+  omptarget
+)

diff  --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
new file mode 100644
index 0000000000000..1348e0fa2b1a8
--- /dev/null
+++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -0,0 +1,179 @@
+//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility to replay the execution of recorded OpenMP
+// offload kernels.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptargetplugin.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");
+
+// InputFilename - The filename to read the json description of the kernel.
+static cl::opt<std::string> InputFilename(cl::Positional,
+                                          cl::desc("<input kernel json file>"),
+                                          cl::Required);
+
+static cl::opt<bool> VerifyOpt(
+    "verify",
+    cl::desc(
+        "Verify device memory post execution against the original output."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+static cl::opt<bool> SaveOutputOpt(
+    "save-output",
+    cl::desc("Save the device memory output of the replayed kernel execution."),
+    cl::init(false), cl::cat(ReplayOptions));
+
+static cl::opt<unsigned> NumTeamsOpt("num-teams",
+                                     cl::desc("Set the number of teams."),
+                                     cl::init(0), cl::cat(ReplayOptions));
+
+static cl::opt<unsigned> NumThreadsOpt("num-threads",
+                                       cl::desc("Set the number of threads."),
+                                       cl::init(0), cl::cat(ReplayOptions));
+
+static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
+                                    cl::init(-1), cl::cat(ReplayOptions));
+
+int main(int argc, char **argv) {
+  cl::HideUnrelatedOptions(ReplayOptions);
+  cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
+      MemoryBuffer::getFile(InputFilename, /* isText */ true,
+                            /* RequiresNullTerminator */ true);
+  if (!KernelInfoMB)
+    report_fatal_error("Error reading the kernel info json file");
+  Expected<json::Value> JsonKernelInfo =
+      json::parse(KernelInfoMB.get()->getBuffer());
+  if (auto Err = JsonKernelInfo.takeError())
+    report_fatal_error("Cannot parse the kernel info json file");
+
+  auto NumTeamsJson =
+      JsonKernelInfo->getAsObject()->getInteger("NumTeamsClause");
+  unsigned NumTeams = (NumTeamsOpt > 0 ? NumTeamsOpt : NumTeamsJson.value());
+  auto NumThreadsJson =
+      JsonKernelInfo->getAsObject()->getInteger("ThreadLimitClause");
+  unsigned NumThreads =
+      (NumThreadsOpt > 0 ? NumThreadsOpt : NumThreadsJson.value());
+  // TODO: Print a warning if number of teams/threads is explicitly set in the
+  // kernel info but overriden through command line options.
+  auto LoopTripCount =
+      JsonKernelInfo->getAsObject()->getInteger("LoopTripCount");
+  auto KernelFunc = JsonKernelInfo->getAsObject()->getString("Name");
+
+  SmallVector<void *> TgtArgs;
+  SmallVector<ptrdiff_t> TgtArgOffsets;
+  auto NumArgs = JsonKernelInfo->getAsObject()->getInteger("NumArgs");
+  auto *TgtArgsArray = JsonKernelInfo->getAsObject()->getArray("ArgPtrs");
+  for (auto It : *TgtArgsArray)
+    TgtArgs.push_back(reinterpret_cast<void *>(It.getAsInteger().value()));
+  auto *TgtArgOffsetsArray =
+      JsonKernelInfo->getAsObject()->getArray("ArgOffsets");
+  for (auto It : *TgtArgOffsetsArray)
+    TgtArgOffsets.push_back(
+        reinterpret_cast<ptrdiff_t>(It.getAsInteger().value()));
+
+  __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
+  std::string KernelEntryName = KernelFunc.value().str();
+  KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
+  // Anything non-zero works to uniquely identify the kernel.
+  KernelEntry.addr = (void *)0x1;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
+      MemoryBuffer::getFile(KernelEntryName + ".image", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!ImageMB)
+    report_fatal_error("Error reading the kernel image.");
+
+  __tgt_device_image DeviceImage;
+  DeviceImage.ImageStart = (void *)ImageMB.get()->getBufferStart();
+  DeviceImage.ImageEnd = (void *)ImageMB.get()->getBufferEnd();
+  DeviceImage.EntriesBegin = &KernelEntry;
+  DeviceImage.EntriesEnd = &KernelEntry + 1;
+
+  __tgt_bin_desc Desc;
+  Desc.NumDeviceImages = 1;
+  Desc.HostEntriesBegin = &KernelEntry;
+  Desc.HostEntriesEnd = &KernelEntry + 1;
+  Desc.DeviceImages = &DeviceImage;
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
+      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+  if (!DeviceMemoryMB)
+    report_fatal_error("Error reading the kernel input device memory.");
+
+  setenv("LIBOMPTARGET_REPLAY", "1", 1);
+  if (VerifyOpt || SaveOutputOpt)
+    setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1);
+
+  auto DeviceMemorySizeJson =
+      JsonKernelInfo->getAsObject()->getInteger("DeviceMemorySize");
+  // Set device memory size to the ceiling of GB granularity.
+  uint64_t DeviceMemorySize =
+      std::ceil(DeviceMemorySizeJson.value() / (1024.0 * 1024.0 * 1024.0));
+
+  setenv("LIBOMPTARGET_RR_DEVMEM_SIZE",
+         std::to_string(DeviceMemorySize).c_str(), 1);
+
+  auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId");
+  // TODO: Print warning if the user overrides the device id in the json file.
+  int32_t DeviceId = (DeviceIdOpt > -1 ? DeviceIdOpt : DeviceIdJson.value());
+
+  // TODO: do we need requires?
+  //__tgt_register_requires(/* Flags */1);
+
+  __tgt_init_all_rtls();
+
+  __tgt_register_lib(&Desc);
+
+  __tgt_target_kernel_replay(
+      /* Loc */ nullptr, DeviceId, KernelEntry.addr,
+      (void *)DeviceMemoryMB.get()->getBuffer().data(),
+      DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
+      TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads,
+      LoopTripCount.value());
+
+  if (VerifyOpt) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".original.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!OriginalOutputMB)
+      report_fatal_error("Error reading the kernel original output file, make "
+                         "sure LIBOMPTARGET_SAVE_OUTPUT is set when recording");
+    ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
+        MemoryBuffer::getFile(KernelEntryName + ".replay.output",
+                              /* isText */ false,
+                              /* RequiresNullTerminator */ false);
+    if (!ReplayOutputMB)
+      report_fatal_error("Error reading the kernel replay output file");
+
+    StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
+    StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
+    if (OriginalOutput == ReplayOutput)
+      outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
+    else
+      outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
+                "verify!\n";
+  }
+  // TODO: calling unregister lib causes plugin deinit error for nextgen
+  // plugins.
+  //__tgt_unregister_lib(&Desc);
+
+  return 0;
+}


        


More information about the Openmp-commits mailing list