[Openmp-commits] [llvm] [openmp] [offload][OpenMP] Fix record replay when no memory is used (PR #201771)

Kevin Sala Penades via Openmp-commits openmp-commits at lists.llvm.org
Wed Jun 17 20:22:27 PDT 2026


https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/201771

>From c02f7b1d05dc137dbafa8c601bd14b8936e658a4 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 17 Jun 2026 18:10:56 -0700
Subject: [PATCH 1/2] [offload] Improve report printing for kernel recording

---
 offload/libomptarget/device.cpp               | 10 +++-
 offload/libomptarget/omptarget.cpp            |  3 +-
 .../common/include/PluginInterface.h          |  5 +-
 .../common/include/RecordReplay.h             | 14 ++++--
 .../common/src/PluginInterface.cpp            |  8 +--
 .../common/src/RecordReplay.cpp               | 50 +++++++++++++------
 openmp/docs/design/Runtimes.rst               | 17 ++++++-
 7 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 546f679353544..c977c369aef43 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -95,13 +95,19 @@ llvm::Error DeviceTy::init() {
     Int32Envar OMPX_RecordDevice("LIBOMPTARGET_RECORD_DEVICE", 0);
     StringEnvar OMPX_RecordOutputDir("LIBOMPTARGET_RECORD_DIR", "");
     BoolEnvar OMPX_EmitRecordReport("LIBOMPTARGET_RECORD_REPORT", false);
+    StringEnvar OMPX_RecordReportFile("LIBOMPTARGET_RECORD_REPORT_FILE", "");
     if (OMPX_RecordDevice != RTLDeviceID)
       return llvm::Error::success();
 
+    // Print report if it was enabled explicitly or a report file was indicated.
+    bool EmitReport =
+        OMPX_EmitRecordReport || !OMPX_RecordReportFile.get().empty();
+
     Ret = RTL->initialize_record_replay(
         RTLDeviceID, OMPX_RecordMemSize, nullptr,
-        /*IsRecord=*/true, /*IsNative=*/true, OMPX_RecordOutput,
-        OMPX_EmitRecordReport, OMPX_RecordOutputDir.get().c_str());
+        /*IsRecord=*/true, /*IsNative=*/true, OMPX_RecordOutput, EmitReport,
+        OMPX_RecordReportFile.get().c_str(),
+        OMPX_RecordOutputDir.get().c_str());
     if (Ret != OFFLOAD_SUCCESS)
       return error::createOffloadError(error::ErrorCode::BACKEND_FAILURE,
                                        "failed to initialize RR in device %d\n",
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 17b215732d51b..4d7a01853c2e1 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2383,7 +2383,8 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                        const char *OutputDirPath) {
   return Device.RTL->initialize_record_replay(
       Device.DeviceID, MemorySize, VAddr, IsRecord,
-      /*IsNative=*/true, SaveOutput, EmitReport, OutputDirPath);
+      /*IsNative=*/true, SaveOutput, EmitReport, /*ReportFile=*/"",
+      OutputDirPath);
 }
 
 /// Executes a kernel using pre-recorded information for loading to
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f50b07aad0209..6e208bbc7e056 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1252,7 +1252,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   Error initRecordReplay(int64_t Size, void *VAddr, bool IsRecord,
                          bool IsNative, bool SaveOutput, bool EmitReport,
-                         const char *OutputDirPath) {
+                         const char *ReportFile, const char *OutputDirPath) {
     if (RecordReplay)
       return Plugin::error(error::ErrorCode::INVALID_ARGUMENT,
                            "RR already initialized");
@@ -1267,7 +1267,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
     RecordReplay =
         new NativeRecordReplayTy(Status, OutputDirPath ? OutputDirPath : "",
-                                 SaveOutput, EmitReport, *this);
+                                 SaveOutput, EmitReport, ReportFile, *this);
     return RecordReplay->init(Size, VAddr);
   }
 
@@ -1587,6 +1587,7 @@ struct GenericPluginTy {
   int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                    void *VAddr, bool IsRecord, bool IsNative,
                                    bool SaveOutput, bool EmitReport,
+                                   const char *ReportFile,
                                    const char *OutputDirPath);
 
   /// Loads the associated binary into the plugin and returns a handle to it.
diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h
index 65a861cc8a0cc..6f083943131d2 100644
--- a/offload/plugins-nextgen/common/include/RecordReplay.h
+++ b/offload/plugins-nextgen/common/include/RecordReplay.h
@@ -80,9 +80,12 @@ struct RecordReplayTy {
   /// Whether a memory snapshot should be recorded a kernel execution.
   bool SaveOutput;
 
-  /// Whether a report should be emitted afther the recording.
+  /// Whether a report should be emitted after the recording.
   bool EmitReport;
 
+  /// The name of the file where to emit the record report.
+  std::string ReportFile;
+
   /// Reference to the corresponding device.
   GenericDeviceTy &Device;
 
@@ -157,13 +160,14 @@ struct RecordReplayTy {
 
   /// Tracker of record replay instances.
   std::unordered_set<InstanceTy, InstanceHasher> Instances;
+  SmallVector<const InstanceTy *> OrderedInstances;
   std::mutex InstancesLock;
 
 public:
   RecordReplayTy(StatusTy Status, StringRef OutputDirectoryStr, bool SaveOutput,
-                 bool EmitReport, GenericDeviceTy &Device)
+                 bool EmitReport, StringRef ReportFile, GenericDeviceTy &Device)
       : Status(Status), SaveOutput(SaveOutput), EmitReport(EmitReport),
-        Device(Device) {
+        ReportFile(ReportFile.str()), Device(Device) {
     if (OutputDirectoryStr == "")
       OutputDirectory = std::filesystem::current_path();
     else
@@ -260,10 +264,10 @@ struct RecordReplayTy {
 /// The native kernel record replay support.
 struct NativeRecordReplayTy : public RecordReplayTy {
   NativeRecordReplayTy(StatusTy Status, StringRef OutputDirectoryStr,
-                       bool SaveOutput, bool EmitReport,
+                       bool SaveOutput, bool EmitReport, StringRef ReportFile,
                        GenericDeviceTy &Device)
       : RecordReplayTy(Status, OutputDirectoryStr, SaveOutput, EmitReport,
-                       Device) {}
+                       ReportFile, Device) {}
 
 private:
   Error recordPrologueImpl(const GenericKernelTy &Kernel,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f8f362cf2b4ed..a17a7a9d3e7e3 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1498,13 +1498,13 @@ int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
 
 int32_t GenericPluginTy::initialize_record_replay(
     int32_t DeviceId, int64_t MemorySize, void *VAddr, bool IsRecord,
-    bool IsNative, bool SaveOutput, bool EmitReport,
+    bool IsNative, bool SaveOutput, bool EmitReport, const char *ReportFile,
     const char *OutputDirPath) {
   GenericDeviceTy &Device = getDevice(DeviceId);
 
-  if (auto Err =
-          Device.initRecordReplay(MemorySize, VAddr, IsRecord, IsNative,
-                                  SaveOutput, EmitReport, OutputDirPath)) {
+  if (auto Err = Device.initRecordReplay(MemorySize, VAddr, IsRecord, IsNative,
+                                         SaveOutput, EmitReport, ReportFile,
+                                         OutputDirPath)) {
     REPORT() << "Failure to initialize RR with " << MemorySize
              << " bytes on device " << DeviceId << ": "
              << toString(std::move(Err));
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index e89b516983e41..a09a127c43ca4 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -88,20 +88,34 @@ Error RecordReplayTy::deinit() {
 }
 
 Error RecordReplayTy::emitInstanceReport() {
+  llvm::raw_ostream *OutStream = &llvm::outs();
+  std::unique_ptr<llvm::raw_fd_ostream> FileOut;
+
+  if (!ReportFile.empty()) {
+    // The report file is emitted in the output directory.
+    std::string ReportFilePath =
+        (std::filesystem::absolute(OutputDirectory) / ReportFile).string();
+    std::error_code EC;
+    FileOut = std::make_unique<llvm::raw_fd_ostream>(ReportFilePath, EC);
+    if (EC)
+      return Plugin::error(ErrorCode::HOST_IO, "saving report file");
+    OutStream = FileOut.get();
+  }
+
   std::lock_guard<std::mutex> LG(InstancesLock);
-  llvm::outs() << "=== Kernel Record Report ===\n";
-  llvm::outs() << "Directory: "
-               << std::filesystem::absolute(OutputDirectory).string() << "\n";
-  llvm::outs() << "Total Instances: " << Instances.size() << "\n";
-  llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
+  *OutStream << "=== Kernel Record Report ===\n";
+  *OutStream << "Directory: "
+             << std::filesystem::absolute(OutputDirectory).string() << "\n";
+  *OutStream << "Total Instances: " << OrderedInstances.size() << "\n";
+  *OutStream << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
 
   SmallString<128> Filename;
-  for (const auto &Inst : Instances)
-    llvm::outs()
-        << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
-        << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
-        << ", " << Inst.Occurrences << "\n";
-  llvm::outs() << "=== End Kernel Record Report ===\n";
+  for (const auto *Inst : OrderedInstances)
+    *OutStream
+        << getFilename(*Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
+        << ", " << Inst->Kernel.getName() << ", " << Inst->getRecordedTimeNs()
+        << ", " << Inst->Occurrences << "\n";
+  *OutStream << "=== End Kernel Record Report ===\n";
 
   return Plugin::success();
 }
@@ -114,18 +128,24 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
   std::lock_guard<std::mutex> LG(InstancesLock);
   auto [It, Inserted] = Instances.emplace(Kernel, NumTeams, NumThreads,
                                           SharedMemorySize, ReplayOutcome);
+  // Keep insertion order.
+  if (Inserted)
+    OrderedInstances.push_back(&(*It));
+
   // Increase the number of occurrences.
   It->Occurrences += 1;
-  return {*It, Inserted};
+
+  // Return reference and whether it was registered for the first time. Notice
+  // that registering an unregistered instance counts as a new registration.
+  return {*It, (It->Occurrences == 1)};
 }
 
 Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
   assert(isReplaying() && "Cannot unregister instance when recording.");
 
+  // Do not remove it, it may be reused in the future.
   std::lock_guard<std::mutex> LG(InstancesLock);
-  size_t Erased = Instances.erase(Instance);
-  if (Erased != 1)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
+  Instance.Occurrences = 0;
   return Plugin::success();
 }
 
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 8b3b7e9bed0c6..098946b2743d6 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1279,6 +1279,7 @@ is provided below.
 * ``LIBOMPTARGET_RECORD=[TRUE/FALSE] (default FALSE)``
 * ``LIBOMPTARGET_RECORD_DIR=<Filepath>``
 * ``LIBOMPTARGET_RECORD_REPORT=[TRUE/FALSE] (default FALSE)``
+* ``LIBOMPTARGET_RECORD_REPORT_FILE=<Filename>``
 * ``LIBOMPTARGET_RECORD_MEMSIZE=<Num> (default 8*1024*1024*1024)``
 * ``LIBOMPTARGET_RECORD_DEVICE=<Num> (default 0)``
 * ``LIBOMPTARGET_RECORD_OUTPUT=[TRUE/FALSE] (default TRUE)``
@@ -1309,8 +1310,20 @@ LIBOMPTARGET_RECORD_REPORT
 """"""""""""""""""""""""""
 
 This environment variable is used to instruct the runtime to emit a summary of
-the recorded kernel instances and their associated JSON files. By default, no
-report is emitted.
+the recorded kernel instances and their associated JSON files. When enabled, the
+report is emitted in the standard output. The environment variable
+:ref:`LIBOMPTARGET_RECORD_REPORT_FILE` can be used to indicate a file where to
+emit the report. By default, no report is emitted.
+
+.. _libomptarget_record_report_file:
+
+LIBOMPTARGET_RECORD_REPORT_FILE
+"""""""""""""""""""""""""""""""
+
+This environment variable is used to instruct the runtime to emit the recording
+report to a specific output file. The file will be created within the recording
+directory. Note that it is not needed to use :ref:`LIBOMPTARGET_RECORD_REPORT`
+when setting this enviornment variable.
 
 LIBOMPTARGET_RECORD_MEMSIZE
 """""""""""""""""""""""""""

>From 0f56de1e5fd353b55702bb66a77f394d9da90902 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 5 Jun 2026 00:27:07 -0700
Subject: [PATCH 2/2] [offload][OpenMP] Fix record replay when no memory is
 used

Progams that do not use any memory (e.g., no mappings) were failing because
we were trying to execute zero size transfers.
---
 offload/libomptarget/omptarget.cpp            | 30 +++++++++++--------
 .../common/src/RecordReplay.cpp               | 24 ++++++++-------
 .../record-replay-empty-memory.cpp            | 26 ++++++++++++++++
 .../kernelreplay/llvm-omp-kernel-replay.cpp   |  3 +-
 4 files changed, 60 insertions(+), 23 deletions(-)
 create mode 100644 offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp

diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 4d7a01853c2e1..facc8a0a7b030 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2440,6 +2440,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   // Initialize the device memory of each global.
   for (int32_t I = 0; I < NumGlobals; ++I) {
     assert(Globals[I].AuxAddr && "Global has no AuxAddr.");
+    assert(Globals[I].Size && "Global has Size zero.");
 
     // Initialize the value of the global in the device.
     int Ret = Device.submitData(Symbols[I + 1].DevPtr, Globals[I].AuxAddr,
@@ -2450,25 +2451,30 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
     }
   }
 
-  // Reuse a previous device allocation or allocate a new device buffer.
+  // Reuse a previous device allocation or allocate a new device buffer. Do not
+  // allocate anything if the size is zero.
   void *&TgtPtr = ReuseDeviceAlloc;
-  if (!TgtPtr)
+  if (!TgtPtr && DeviceMemorySize) {
     TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                               TARGET_ALLOC_DEFAULT);
-  if (!TgtPtr) {
-    REPORT() << "Failed to allocate device memory.";
-    return OFFLOAD_FAIL;
+    if (!TgtPtr) {
+      REPORT() << "Failed to allocate device memory.";
+      return OFFLOAD_FAIL;
+    }
   }
 
   // Save the device allocation for future replays of the same kernel.
   if (ReplayOutcome)
     ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
 
-  int Ret =
-      Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT() << "Failed to submit data to a global.";
-    return OFFLOAD_FAIL;
+  // Initialize the device memory.
+  if (DeviceMemorySize) {
+    int Ret =
+        Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS) {
+      REPORT() << "Failed to submit data to the device memory.";
+      return OFFLOAD_FAIL;
+    }
   }
 
   KernelArgsTy KernelArgs{};
@@ -2487,8 +2493,8 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   KernelExtraArgsTy KernelExtraArgs{};
   KernelExtraArgs.ReplayOutcome = ReplayOutcome;
 
-  Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs,
-                            &KernelExtraArgs, AsyncInfo);
+  int Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets,
+                                KernelArgs, &KernelExtraArgs, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT() << "Failed to launch kernel replay.";
     return OFFLOAD_FAIL;
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index a09a127c43ca4..fe5629483b53e 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -338,17 +338,21 @@ Error NativeRecordReplayTy::recordSnapshot(StringRef Filename) {
   uint64_t RecordSize = CurrentSize;
   AllocationLock.unlock();
 
-  ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
-      WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
-  if (!DeviceMemoryMB)
-    return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
-                         "creating MemoryBuffer for device memory");
-
-  if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
-                                     StartAddr, RecordSize, nullptr))
-    return Err;
+  StringRef DeviceMemory;
+  if (RecordSize) {
+    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
+        WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
+    if (!DeviceMemoryMB)
+      return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
+                           "creating MemoryBuffer for device memory");
+
+    if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
+                                       StartAddr, RecordSize, nullptr))
+      return Err;
 
-  StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), RecordSize);
+    DeviceMemory =
+        StringRef(DeviceMemoryMB.get()->getBufferStart(), RecordSize);
+  }
 
   std::error_code EC;
   raw_fd_ostream OS(Filename, EC);
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
new file mode 100644
index 0000000000000..0705c6d66ac8e
--- /dev/null
+++ b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
+// clang-format on
+
+// REQUIRES: gpu
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: x86_64-unknown-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: intelgpu
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+#pragma omp target teams num_teams(256)
+  {
+  }
+
+  // CHECK: PASS
+  printf("PASS\n");
+}
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index a5bda7a0f0444..4335002fd8c77 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -130,7 +130,8 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
   if (!ReplayOutputBufferOrErr)
     return createErr("failed to read the kernel replay output file");
 
-  // Compare record and replay outputs to verify they match.
+  // Compare record and replay outputs to verify they match. If they are both
+  // empty, the verification is successful.
   StringRef RecordOutput = RecordOutputBufferOrErr.get()->getBuffer();
   StringRef ReplayOutput = ReplayOutputBufferOrErr.get()->getBuffer();
   if (RecordOutput != ReplayOutput)



More information about the Openmp-commits mailing list