[Openmp-commits] [openmp] f48c4d8 - [OpenMP] Be more forgiving during record and replay

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Mon Nov 20 17:24:36 PST 2023


Author: Johannes Doerfert
Date: 2023-11-20T17:15:34-08:00
New Revision: f48c4d8aa1e46bcaf3204a87e3e62716cf35b47b

URL: https://github.com/llvm/llvm-project/commit/f48c4d8aa1e46bcaf3204a87e3e62716cf35b47b
DIFF: https://github.com/llvm/llvm-project/commit/f48c4d8aa1e46bcaf3204a87e3e62716cf35b47b.diff

LOG: [OpenMP] Be more forgiving during record and replay

When we record and replay kernels we should not error out early if there
is a chance the program might still run fine. This patch will:
1) Fall back to the allocation heuristic if the VAMap doesn't work.
2) Adjust the memory start to match the required address if possible.
3) Adjust the (guessed) pointer arguments if the memory start adjustment
   is impossible. This will allow kernels without indirect accesses to
   work while indirect accesses will most likely fail.

Added: 
    

Modified: 
    openmp/libomptarget/include/omptarget.h
    openmp/libomptarget/include/rtl.h
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/interface.cpp
    openmp/libomptarget/src/omptarget.cpp
    openmp/libomptarget/src/private.h
    openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 818967c88904ec0..19e072abc402e38 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -450,7 +450,8 @@ void __tgt_set_info_flag(uint32_t);
 int __tgt_print_device_info(int64_t DeviceId);
 
 int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                 void *VAddr, bool IsRecord, bool SaveOutput);
+                                 void *VAddr, bool IsRecord, bool SaveOutput,
+                                 uint64_t &ReqPtrArgOffset);
 
 #ifdef __cplusplus
 }

diff  --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 2272577684f0c6c..0c751cd36bfd2d0 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -20,6 +20,7 @@
 
 #include "omptarget.h"
 
+#include <cstdint>
 #include <list>
 #include <map>
 #include <mutex>
@@ -74,7 +75,7 @@ struct RTLInfoTy {
   typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
   typedef int32_t(set_device_offset_ty)(int32_t);
   typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
-                                             bool);
+                                             bool, uint64_t &);
 
   int32_t Idx = -1;             // RTL index, index is the number of devices
                                 // of other RTLs that were registered before,

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 08946e21035014e..fb4db4adf0a367e 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -55,6 +55,8 @@ struct RecordReplayTy {
 
   RRStatusTy Status;
   bool ReplaySaveOutput;
+  bool UsedVAMap = false;
+  uintptr_t MemoryOffset = 0;
 
   void *suggestAddress(uint64_t MaxMemoryAllocation) {
     // Get a valid pointer address for this system
@@ -89,10 +91,12 @@ struct RecordReplayTy {
     MemoryPtr = MemoryStart;
     MemorySize = 0;
     TotalSize = ASize;
+    UsedVAMap = true;
     return Plugin::success();
   }
 
-  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
+  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
+                             uint64_t RequiredMemoryAllocation, void *VAddr) {
     const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
     MemoryStart = nullptr;
@@ -102,32 +106,55 @@ struct RecordReplayTy {
       if (MemoryStart)
         break;
     }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
-         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
-         MemoryStart);
-
     if (!MemoryStart)
       return Plugin::error("Allocating record/replay memory");
 
     if (VAddr && VAddr != MemoryStart)
-      return Plugin::error("Cannot allocate recorded address");
+      MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
 
     MemoryPtr = MemoryStart;
     MemorySize = 0;
 
+    // Check if we need adjustment.
+    if (MemoryOffset > 0 &&
+        TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
+      // If we are off but "before" the required address and with enough space,
+      // we just "allocate" the offset to match the required address.
+      MemoryPtr = (char *)MemoryPtr + MemoryOffset;
+      MemorySize += MemoryOffset;
+      MemoryOffset = 0;
+      assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
+    } else if (MemoryOffset) {
+      // If we are off and in a situation we cannot just "waste" memory to force
+      // a match, we hope adjusting the arguments is sufficient.
+      REPORT(
+          "WARNING Failed to allocate replay memory at required location %p, "
+          "got %p, trying to offset argument pointers by %" PRIi64 "\n",
+          VAddr, MemoryStart, MemoryOffset);
+    }
+
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
+         MemoryStart);
+
     return Plugin::success();
   }
 
   Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
-    if (Device->supportVAManagement())
-      return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
+    if (Device->supportVAManagement()) {
+      auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
+      if (Err) {
+        REPORT("WARNING VA mapping failed, fallback to heuristic: "
+               "(Error: %s)\n",
+               toString(std::move(Err)).data());
+      }
+    }
 
     uint64_t DevMemSize;
     if (Device->getDeviceMemorySize(DevMemSize))
       return Plugin::error("Cannot determine Device Memory Size");
 
-    return preAllocateHeuristic(DevMemSize, ReqVAddr);
+    return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
   }
 
   void dumpDeviceMemory(StringRef Filename) {
@@ -293,7 +320,7 @@ struct RecordReplayTy {
   }
 
   Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
-             RRStatusTy Status, bool SaveOutput) {
+             RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
     this->Device = Device;
     this->Status = Status;
     this->ReplaySaveOutput = SaveOutput;
@@ -308,11 +335,14 @@ struct RecordReplayTy {
          MemoryStart, TotalSize,
          Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
 
+    // Tell the user to offset pointer arguments as the memory allocation does
+    // not match.
+    ReqPtrArgOffset = MemoryOffset;
     return Plugin::success();
   }
 
   void deinit() {
-    if (Device->supportVAManagement()) {
+    if (UsedVAMap) {
       if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
         report_fatal_error("Error on releasing virtual memory space");
     } else {
@@ -1694,15 +1724,16 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
 
 int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                            void *VAddr, bool isRecord,
-                                           bool SaveOutput) {
+                                           bool SaveOutput,
+                                           uint64_t &ReqPtrArgOffset) {
   GenericPluginTy &Plugin = Plugin::get();
   GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
   RecordReplayTy::RRStatusTy Status =
       isRecord ? RecordReplayTy::RRStatusTy::RRRecording
                : RecordReplayTy::RRStatusTy::RRReplaying;
 
-  if (auto Err =
-          RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
+  if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status,
+                                   SaveOutput, ReqPtrArgOffset)) {
     REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
            "(Error: %s)\n",
            MemorySize, toString(std::move(Err)).data());

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 8a2fe4620b39cbe..da167845ccb06c4 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -539,15 +539,10 @@ void DeviceTy::init() {
     // Enables saving the device memory kernel output post execution if set.
     llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput(
         "LIBOMPTARGET_RR_SAVE_OUTPUT", false);
-    // Sets the maximum to pre-allocate device memory.
-    llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize(
-        "LIBOMPTARGET_RR_DEVMEM_SIZE", 16);
-    DP("Activating Record-Replay for Device %d with %lu GB memory\n",
-       RTLDeviceID, OMPX_DeviceMemorySize.get());
-
-    RTL->activate_record_replay(RTLDeviceID,
-                                OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
-                                nullptr, true, OMPX_ReplaySaveOutput);
+
+    uint64_t ReqPtrArgOffset;
+    RTL->activate_record_replay(RTLDeviceID, 0, nullptr, true,
+                                OMPX_ReplaySaveOutput, ReqPtrArgOffset);
   }
 
   IsInit = true;

diff  --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index e9ab7f05c7a0a76..1e6bfec012f3d59 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -21,6 +21,7 @@
 #include "Utilities.h"
 
 #include <cassert>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <mutex>
@@ -347,15 +348,16 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 ///                   execution on persistent storage
 EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                         void *VAddr, bool IsRecord,
-                                        bool SaveOutput) {
+                                        bool SaveOutput,
+                                        uint64_t &ReqPtrArgOffset) {
   if (!deviceIsReady(DeviceId)) {
     DP("Device %" PRId64 " is not ready\n", DeviceId);
     return OMP_TGT_FAIL;
   }
 
   DeviceTy &Device = *PM->Devices[DeviceId];
-  [[maybe_unused]] int Rc =
-      target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
+  [[maybe_unused]] int Rc = target_activate_rr(
+      Device, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
   assert(Rc == OFFLOAD_SUCCESS &&
          "__tgt_activate_record_replay unexpected failure!");
   return OMP_TGT_SUCCESS;

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 6c59bc1cf38a8bb..0da448fdbefa477 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -1725,9 +1725,11 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 /// and informing the record-replayer of whether to store the output
 /// in some file.
 int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
-                       bool isRecord, bool SaveOutput) {
+                       bool isRecord, bool SaveOutput,
+                       uint64_t &ReqPtrArgOffset) {
   return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
-                                            isRecord, SaveOutput);
+                                            isRecord, SaveOutput,
+                                            ReqPtrArgOffset);
 }
 
 /// Executes a kernel using pre-recorded information for loading to

diff  --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 2a06bdbd1b708c4..3eb500cbd4c97f2 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -42,7 +42,8 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                   KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
 
 extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
-                              void *ReqAddr, bool isRecord, bool SaveOutput);
+                              void *ReqAddr, bool isRecord, bool SaveOutput,
+                              uint64_t &ReqPtrArgOffset);
 
 extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                          void *DeviceMemory, int64_t DeviceMemorySize,

diff  --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 254be7db6e01a41..67304fdca61d447 100644
--- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <cstdint>
 #include <cstdlib>
 
 using namespace llvm;
@@ -128,8 +129,9 @@ int main(int argc, char **argv) {
 
   __tgt_register_lib(&Desc);
 
+  uint64_t ReqPtrArgOffset = 0;
   int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
-                                        false, VerifyOpt);
+                                        false, VerifyOpt, ReqPtrArgOffset);
 
   if (Rc != OMP_TGT_SUCCESS) {
     report_fatal_error("Cannot activate record replay\n");
@@ -149,6 +151,18 @@ int main(int argc, char **argv) {
               const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
               DeviceMemoryMB.get()->getBufferSize());
 
+  // If necessary, adjust pointer arguments.
+  if (ReqPtrArgOffset) {
+    for (auto *&Arg : TgtArgs) {
+      auto ArgInt = uintptr_t(Arg);
+      // Try to find pointer arguments.
+      if (ArgInt < uintptr_t(BAllocStart) ||
+          ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize)
+        continue;
+      Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset);
+    }
+  }
+
   __tgt_target_kernel_replay(
       /* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data,
       DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),


        


More information about the Openmp-commits mailing list