[Openmp-commits] [openmp] 73cb01d - [OpenMP] Support for OpenMP-Offload Record Replay

Konstantinos Parasyris via Openmp-commits openmp-commits at lists.llvm.org
Sat Aug 5 00:46:56 PDT 2023


Author: koparasy
Date: 2023-08-05T00:46:06-07:00
New Revision: 73cb01dc8a2f5ea36463798019e0c2d17e3b61b9

URL: https://github.com/llvm/llvm-project/commit/73cb01dc8a2f5ea36463798019e0c2d17e3b61b9
DIFF: https://github.com/llvm/llvm-project/commit/73cb01dc8a2f5ea36463798019e0c2d17e3b61b9.diff

LOG: [OpenMP] Support for OpenMP-Offload Record Replay

Enable record-replay for OpenMP offload kernels.  On recording the initialization
is performed on device initialization by reading env variables. (This is similar to
the way rr used to operate). The primary change takes place in the replay phase
with the replay tool explicitly initializing the record-replay functionality.

Differential Revision: https://reviews.llvm.org/D156174

Fix

Added: 
    

Modified: 
    openmp/libomptarget/include/omptarget.h
    openmp/libomptarget/include/rtl.h
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/exports
    openmp/libomptarget/src/interface.cpp
    openmp/libomptarget/src/omptarget.cpp
    openmp/libomptarget/src/private.h
    openmp/libomptarget/src/rtl.cpp
    openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 4aab729190b9c4..f05c4015da5f2c 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -433,6 +433,10 @@ int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
 void __tgt_set_info_flag(uint32_t);
 
 int __tgt_print_device_info(int64_t DeviceId);
+
+int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
+                                 bool IsRecord, bool SaveOutput);
+
 #ifdef __cplusplus
 }
 #endif

diff  --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 145f05585a17d8..29746b6a47ea75 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -72,6 +72,7 @@ struct RTLInfoTy {
   typedef int32_t(data_unlock_ty)(int32_t, void *);
   typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
   typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
+  typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
 
   int32_t Idx = -1;             // RTL index, index is the number of devices
                                 // of other RTLs that were registered before,
@@ -124,6 +125,7 @@ struct RTLInfoTy {
   data_unlock_ty *data_unlock = nullptr;
   data_notify_mapped_ty *data_notify_mapped = nullptr;
   data_notify_unmapped_ty *data_notify_unmapped = nullptr;
+  activate_record_replay_ty *activate_record_replay = nullptr;
 
   // Are there images associated with this RTL.
   bool IsUsed = false;

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
index ff19f4aaefc46f..657996ff112aaa 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp
@@ -358,5 +358,7 @@ bool JITEngine::checkBitcodeImage(const __tgt_device_image &Image) {
   auto BitcodeTA = Triple(ActualTriple).getArch();
   BitcodeImageMap[Image.ImageStart] = BitcodeTA;
 
+  DP("Is%s IR Image\n", BitcodeTA == TT.getArch() ? " " : " NOT");
+
   return BitcodeTA == TT.getArch();
 }

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 514c2e2e1fe148..d54982ea447091 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -39,6 +39,10 @@ GenericPluginTy *Plugin::SpecificPlugin = nullptr;
 
 // TODO: Fix any thread safety issues for multi-threaded kernel recording.
 struct RecordReplayTy {
+
+  // Describes the state of the record replay mechanism.
+  enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
+
 private:
   // Memory pointers for recording, replaying memory.
   void *MemoryStart;
@@ -47,26 +51,19 @@ struct RecordReplayTy {
   GenericDeviceTy *Device;
   std::mutex AllocationLock;
 
-  // Environment variables for record and replay.
-  // Enables recording kernels if set.
-  BoolEnvar OMPX_RecordKernel;
-  // Enables replaying a kernel if set.
-  BoolEnvar OMPX_ReplayKernel;
-  // Enables saving the device memory kernel output post execution if set.
-  BoolEnvar OMPX_ReplaySaveOutput;
-  // Sets the maximum to pre-allocate device memory.
-  UInt32Envar OMPX_DeviceMemorySize;
+  RRStatusTy Status;
+  bool ReplaySaveOutput;
+  uint64_t DeviceMemorySize;
 
   // Record/replay pre-allocates the largest possible device memory using the
   // default kind.
   // TODO: Expand allocation to include other kinds (device, host, shared) and
   // possibly use a MemoryManager to track (de-)allocations for
   // storing/retrieving when recording/replaying.
-  Error preallocateDeviceMemory() {
+  Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
     // Pre-allocate memory on device. Starts with 64GB and subtracts in steps
     // of 1GB until allocation succeeds.
-    const size_t MAX_MEMORY_ALLOCATION =
-        OMPX_DeviceMemorySize * 1024 * 1024 * 1024ULL;
+    const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
     MemoryStart = nullptr;
     for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
@@ -85,15 +82,14 @@ struct RecordReplayTy {
     return Plugin::success();
   }
 
-  void dumpDeviceMemory(StringRef Filename,
-                        AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  void dumpDeviceMemory(StringRef Filename) {
     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
         WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
     if (!DeviceMemoryMB)
       report_fatal_error("Error creating MemoryBuffer for device memory");
 
     auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
-                                    MemoryStart, MemorySize, AsyncInfoWrapper);
+                                    MemoryStart, MemorySize, nullptr);
     if (Err)
       report_fatal_error("Error retrieving data for target pointer");
 
@@ -108,21 +104,19 @@ struct RecordReplayTy {
   }
 
 public:
-  bool isRecording() const { return OMPX_RecordKernel; }
-  bool isReplaying() const { return OMPX_ReplayKernel; }
+  bool isRecording() const { return Status == RRStatusTy::RRRecording; }
+  bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
   bool isRecordingOrReplaying() const {
-    return (OMPX_RecordKernel || OMPX_ReplayKernel);
+    return (Status != RRStatusTy::RRDeactivated);
   }
-  bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
+  void setStatus(RRStatusTy Status) { this->Status = Status; }
+  bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
 
   RecordReplayTy()
-      : OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
-        OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
-        OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
-        OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
-                              /* Default in GB */ 64) {}
+      : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
+        DeviceMemorySize(-1) {}
 
-  void saveImage(const char *Name, DeviceImageTy &Image) {
+  void saveImage(const char *Name, const DeviceImageTy &Image) {
     SmallString<128> ImageName = {Name, ".image"};
     std::error_code EC;
     raw_fd_ostream OS(ImageName, EC);
@@ -140,11 +134,60 @@ struct RecordReplayTy {
     OS.close();
   }
 
-  void saveKernelInputInfo(const char *Name, void **ArgPtrs,
-                           ptrdiff_t *ArgOffsets, int32_t NumArgs,
-                           uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
-                           uint64_t LoopTripCount,
-                           AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
+    int32_t Size = 0;
+
+    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
+      if (!OffloadEntry.size)
+        continue;
+      Size += std::strlen(OffloadEntry.name) + /* '\0' */ 1 +
+              /* OffloadEntry.size value */ sizeof(uint32_t) +
+              OffloadEntry.size;
+    }
+
+    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(Size);
+    if (!GlobalsMB)
+      report_fatal_error("Error creating MemoryBuffer for globals memory");
+
+    void *BufferPtr = GlobalsMB.get()->getBufferStart();
+    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
+      if (!OffloadEntry.size)
+        continue;
+
+      int32_t NameLength = std::strlen(OffloadEntry.name) + 1;
+      memcpy(BufferPtr, OffloadEntry.name, NameLength);
+      BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
+
+      *((uint32_t *)(BufferPtr)) = OffloadEntry.size;
+      BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
+
+      auto Err = Plugin::success();
+      {
+        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr,
+                                            OffloadEntry.size, nullptr))
+          report_fatal_error("Error retrieving data for global");
+      }
+      if (Err)
+        report_fatal_error("Error retrieving data for global");
+      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size);
+    }
+    assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
+           "Buffer over/under-filled.");
+    assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) &&
+           "Buffer size mismatch");
+
+    StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
+    std::error_code EC;
+    raw_fd_ostream OS(Filename, EC);
+    OS << GlobalsMemory;
+    OS.close();
+  }
+
+  void saveKernelInputInfo(const char *Name, DeviceImageTy &Image,
+                           void **ArgPtrs, ptrdiff_t *ArgOffsets,
+                           int32_t NumArgs, uint64_t NumTeamsClause,
+                           uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
     json::Object JsonKernelInfo;
     JsonKernelInfo["Name"] = Name;
     JsonKernelInfo["NumArgs"] = NumArgs;
@@ -165,7 +208,10 @@ struct RecordReplayTy {
     JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
 
     SmallString<128> MemoryFilename = {Name, ".memory"};
-    dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper);
+    dumpDeviceMemory(MemoryFilename);
+
+    SmallString<128> GlobalsFilename = {Name, ".globals"};
+    dumpGlobals(GlobalsFilename, Image);
 
     SmallString<128> JsonFilename = {Name, ".json"};
     std::error_code EC;
@@ -177,11 +223,10 @@ struct RecordReplayTy {
     JsonOS.close();
   }
 
-  void saveKernelOutputInfo(const char *Name,
-                            AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  void saveKernelOutputInfo(const char *Name) {
     SmallString<128> OutputFilename = {
         Name, (isRecording() ? ".original.output" : ".replay.output")};
-    dumpDeviceMemory(OutputFilename, AsyncInfoWrapper);
+    dumpDeviceMemory(OutputFilename);
   }
 
   void *alloc(uint64_t Size) {
@@ -194,12 +239,28 @@ struct RecordReplayTy {
     Alloc = MemoryPtr;
     MemoryPtr = (char *)MemoryPtr + AlignedSize;
     MemorySize += AlignedSize;
+    DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
     return Alloc;
   }
 
-  Error init(GenericDeviceTy *Device) {
+  Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
+             bool SaveOutput) {
     this->Device = Device;
-    return preallocateDeviceMemory();
+    this->Status = Status;
+    this->DeviceMemorySize = MemSize;
+    this->ReplaySaveOutput = SaveOutput;
+
+    if (auto Err = preallocateDeviceMemory(MemSize))
+      return Err;
+
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Record Replay Initialized (%p)"
+         " as starting address, %lu Memory Size"
+         " and set on status %s\n",
+         MemoryStart, MemSize,
+         Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
+
+    return Plugin::success();
   }
 
   void deinit() { Device->free(MemoryStart); }
@@ -227,7 +288,11 @@ void AsyncInfoWrapperTy::finalize(Error &Err) {
 
 Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
                             DeviceImageTy &Image) {
+
+  ImagePtr = &Image;
+
   PreferredNumThreads = GenericDevice.getDefaultNumThreads();
+
   MaxNumThreads = GenericDevice.getThreadLimit();
 
   return initImpl(GenericDevice, Image);
@@ -468,10 +533,6 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
   if (EnableMM)
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
 
-  if (RecordReplay.isRecordingOrReplaying())
-    if (auto Err = RecordReplay.init(this))
-      return Err;
-
   return Plugin::success();
 }
 
@@ -1087,26 +1148,31 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
                                     __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  AsyncInfoWrapperTy AsyncInfoWrapper(
+      *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo);
 
   GenericKernelTy &GenericKernel =
       *reinterpret_cast<GenericKernelTy *>(EntryPtr);
 
   if (RecordReplay.isRecording())
     RecordReplay.saveKernelInputInfo(
-        GenericKernel.getName(), ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
-        KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0], KernelArgs.Tripcount,
-        AsyncInfoWrapper);
+        GenericKernel.getName(), GenericKernel.getImage(), ArgPtrs, ArgOffsets,
+        KernelArgs.NumArgs, KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0],
+        KernelArgs.Tripcount);
+
+  if (RecordReplay.isRecording())
+    RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage());
 
   auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                   AsyncInfoWrapper);
 
+  // 'finalize' here to guarantee next record-replay actions are in-sync
+  AsyncInfoWrapper.finalize(Err);
+
   if (RecordReplay.isRecordingOrReplaying() &&
       RecordReplay.isSaveOutputEnabled())
-    RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
-                                      AsyncInfoWrapper);
+    RecordReplay.saveKernelOutputInfo(GenericKernel.getName());
 
-  AsyncInfoWrapper.finalize(Err);
   return Err;
 }
 
@@ -1358,6 +1424,28 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
   return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
 }
 
+int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
+                                           uint64_t MemorySize, bool isRecord,
+                                           bool SaveOutput) {
+  GenericPluginTy &Plugin = Plugin::get();
+  GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
+  RecordReplayTy::RRStatusTy Status =
+      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
+               : RecordReplayTy::RRStatusTy::RRReplaying;
+
+  if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
+    REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
+           "(Error: %s)\n",
+           MemorySize, toString(std::move(Err)).data());
+    RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
+
+    if (!isRecord) {
+      return OFFLOAD_FAIL;
+    }
+  }
+  return OFFLOAD_SUCCESS;
+}
+
 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                           __tgt_device_image *TgtImage) {
   GenericPluginTy &Plugin = Plugin::get();

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 740f4023933bcd..9521323eaf3fbd 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -194,6 +194,11 @@ class DeviceImageTy {
   private:
     __tgt_target_table TTTablePtr;
     llvm::SmallVector<__tgt_offload_entry> Entries;
+
+  public:
+    using const_iterator = decltype(Entries)::const_iterator;
+    const_iterator begin() const { return Entries.begin(); }
+    const_iterator end() const { return Entries.end(); }
   };
 
   /// Image identifier within the corresponding device. Notice that this id is
@@ -274,6 +279,12 @@ struct GenericKernelTy {
   /// Get the kernel name.
   const char *getName() const { return Name; }
 
+  /// Get the kernel image.
+  DeviceImageTy &getImage() const {
+    assert(ImagePtr && "Kernel is not initialized!");
+    return *ImagePtr;
+  }
+
   /// Indicate whether an execution mode is valid.
   static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
     switch (ExecutionMode) {
@@ -343,6 +354,9 @@ struct GenericKernelTy {
   /// The execution flags of the kernel.
   OMPTgtExecModeFlags ExecutionMode;
 
+  /// The image that contains this kernel.
+  DeviceImageTy *ImagePtr = nullptr;
+
 protected:
   /// The preferred number of threads to run the kernel.
   uint32_t PreferredNumThreads;

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 276b7c9f499c50..cb3d49913086f5 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -17,6 +17,8 @@
 #include "private.h"
 #include "rtl.h"
 
+#include "Utilities.h"
+
 #include <cassert>
 #include <climits>
 #include <cstdint>
@@ -530,6 +532,23 @@ void DeviceTy::init() {
   if (Ret != OFFLOAD_SUCCESS)
     return;
 
+  // Enables recording kernels if set.
+  llvm::omp::target::BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false);
+  if (OMPX_RecordKernel) {
+    // Enables saving the device memory kernel output post execution if set.
+    llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput(
+        "LIBOMPTARGET_RR_SAVE_OUTPUT", false);
+    // Sets the maximum to pre-allocate device memory.
+    llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize(
+        "LIBOMPTARGET_RR_DEVMEM_SIZE", 16);
+    DP("Activating Record-Replay for Device %d with %lu GB memory\n",
+       RTLDeviceID, OMPX_DeviceMemorySize);
+
+    RTL->activate_record_replay(RTLDeviceID,
+                                OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
+                                true, OMPX_ReplaySaveOutput);
+  }
+
   IsInit = true;
 }
 

diff  --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index 48591dd6c3faff..c29c8d03fb1276 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -28,6 +28,7 @@ VERS1.0 {
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
     __tgt_target_kernel_replay;
+    __tgt_activate_record_replay;
     __tgt_mapper_num_components;
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;

diff  --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 7fb72e16088ced..05de3af4a58380 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -334,6 +334,28 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                      HostPtr, KernelArgs);
 }
 
+/// Activates the record replay mechanism.
+/// \param DeviceId The device identifier to execute the target region.
+/// \param MemorySize The number of bytes to be (pre-)allocated
+///                   by the bump allocator
+/// /param IsRecord Activates the record replay mechanism in
+///                 'record' mode or 'replay' mode.
+/// /param SaveOutput Store the device memory after kernel
+///                   execution on persistent storage
+EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
+                                        bool IsRecord, bool SaveOutput) {
+  if (!deviceIsReady(DeviceId)) {
+    DP("Device %" PRId64 " is not ready\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
+
+  DeviceTy &Device = *PM->Devices[DeviceId];
+  int Rc = target_activate_rr(Device, MemorySize, IsRecord, SaveOutput);
+  assert(Rc == OFFLOAD_SUCCESS &&
+         "__tgt_activate_record_replay unexpected failure!");
+  return OMP_TGT_SUCCESS;
+};
+
 /// Implements a target kernel entry that replays a pre-recorded kernel.
 /// \param Loc Source location associated with this target region (unused).
 /// \param DeviceId The device identifier to execute the target region.

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 0719f1b475f16d..2839176a184c6b 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -1712,6 +1712,15 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   return OFFLOAD_SUCCESS;
 }
 
+/// Enables the record replay mechanism by pre-allocating MemorySize
+/// and informing the record-replayer of whether to store the output
+/// in some file.
+int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord,
+                       bool SaveOutput) {
+  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize,
+                                            isRecord, SaveOutput);
+}
+
 /// Executes a kernel using pre-recorded information for loading to
 /// device memory to launch the target kernel with the pre-recorded
 /// configuration.

diff  --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 14dafef09a6334..876f055653b206 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -41,6 +41,9 @@ extern int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
 extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                   KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
 
+extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
+                              bool isRecord, bool SaveOutput);
+
 extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                          void *DeviceMemory, int64_t DeviceMemorySize,
                          void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,

diff  --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 6a73f6e2897366..ed3e86075f8582 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -250,6 +250,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
   *((void **)&RTL.data_notify_unmapped) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_data_notify_unmapped");
 
+  // Record Replay RTL
+  *((void **)&RTL.activate_record_replay) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_initialize_record_replay");
+
   RTL.LibraryHandler = std::move(DynLibrary);
 
   // Successfully loaded

diff  --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 4b4b168acdcc79..57773900e1e1b4 100644
--- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "omptarget.h"
 #include "omptargetplugin.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/JSON.h"
@@ -111,24 +112,10 @@ int main(int argc, char **argv) {
   Desc.HostEntriesEnd = &KernelEntry + 1;
   Desc.DeviceImages = &DeviceImage;
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
-      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
-                            /* RequiresNullTerminator */ false);
-  if (!DeviceMemoryMB)
-    report_fatal_error("Error reading the kernel input device memory.");
-
-  setenv("LIBOMPTARGET_REPLAY", "1", 1);
-  if (VerifyOpt || SaveOutputOpt)
-    setenv("LIBOMPTARGET_RR_SAVE_OUTPUT", "1", 1);
-
   auto DeviceMemorySizeJson =
       JsonKernelInfo->getAsObject()->getInteger("DeviceMemorySize");
   // Set device memory size to the ceiling of GB granularity.
-  uint64_t DeviceMemorySize =
-      std::ceil(DeviceMemorySizeJson.value() / (1024.0 * 1024.0 * 1024.0));
-
-  setenv("LIBOMPTARGET_RR_DEVMEM_SIZE",
-         std::to_string(DeviceMemorySize).c_str(), 1);
+  uint64_t DeviceMemorySize = std::ceil(DeviceMemorySizeJson.value());
 
   auto DeviceIdJson = JsonKernelInfo->getAsObject()->getInteger("DeviceId");
   // TODO: Print warning if the user overrides the device id in the json file.
@@ -137,13 +124,31 @@ int main(int argc, char **argv) {
   // TODO: do we need requires?
   //__tgt_register_requires(/* Flags */1);
 
-  __tgt_init_all_rtls();
-
   __tgt_register_lib(&Desc);
 
+  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false,
+                                        VerifyOpt);
+
+  if (Rc != OMP_TGT_SUCCESS) {
+    report_fatal_error("Cannot activate record replay\n");
+  }
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
+      MemoryBuffer::getFile(KernelEntryName + ".memory", /* isText */ false,
+                            /* RequiresNullTerminator */ false);
+
+  if (!DeviceMemoryMB)
+    report_fatal_error("Error reading the kernel input device memory.");
+
+  // On AMD for currently unknown reasons we cannot copy memory mapped data to
+  // device. This is a work-around.
+  uint8_t *recored_data = new uint8_t[DeviceMemoryMB.get()->getBufferSize()];
+  std::memcpy(recored_data,
+              const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
+              DeviceMemorySizeJson.value() * sizeof(uint8_t));
+
   __tgt_target_kernel_replay(
-      /* Loc */ nullptr, DeviceId, KernelEntry.addr,
-      const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
+      /* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data,
       DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
       TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads,
       LoopTripCount.value());
@@ -171,6 +176,9 @@ int main(int argc, char **argv) {
       outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
                 "verify!\n";
   }
+
+  delete[] recored_data;
+
   // TODO: calling unregister lib causes plugin deinit error for nextgen
   // plugins.
   //__tgt_unregister_lib(&Desc);


        


More information about the Openmp-commits mailing list