[Openmp-commits] [llvm] [openmp] [offload][OpenMP] Fix record replay when no memory is used (PR #201771)
Kevin Sala Penades via Openmp-commits
openmp-commits at lists.llvm.org
Wed Jun 17 20:22:27 PDT 2026
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/201771
>From c02f7b1d05dc137dbafa8c601bd14b8936e658a4 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 17 Jun 2026 18:10:56 -0700
Subject: [PATCH 1/2] [offload] Improve report printing for kernel recording
---
offload/libomptarget/device.cpp | 10 +++-
offload/libomptarget/omptarget.cpp | 3 +-
.../common/include/PluginInterface.h | 5 +-
.../common/include/RecordReplay.h | 14 ++++--
.../common/src/PluginInterface.cpp | 8 +--
.../common/src/RecordReplay.cpp | 50 +++++++++++++------
openmp/docs/design/Runtimes.rst | 17 ++++++-
7 files changed, 76 insertions(+), 31 deletions(-)
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 546f679353544..c977c369aef43 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -95,13 +95,19 @@ llvm::Error DeviceTy::init() {
Int32Envar OMPX_RecordDevice("LIBOMPTARGET_RECORD_DEVICE", 0);
StringEnvar OMPX_RecordOutputDir("LIBOMPTARGET_RECORD_DIR", "");
BoolEnvar OMPX_EmitRecordReport("LIBOMPTARGET_RECORD_REPORT", false);
+ StringEnvar OMPX_RecordReportFile("LIBOMPTARGET_RECORD_REPORT_FILE", "");
if (OMPX_RecordDevice != RTLDeviceID)
return llvm::Error::success();
+ // Print report if it was enabled explicitly or a report file was indicated.
+ bool EmitReport =
+ OMPX_EmitRecordReport || !OMPX_RecordReportFile.get().empty();
+
Ret = RTL->initialize_record_replay(
RTLDeviceID, OMPX_RecordMemSize, nullptr,
- /*IsRecord=*/true, /*IsNative=*/true, OMPX_RecordOutput,
- OMPX_EmitRecordReport, OMPX_RecordOutputDir.get().c_str());
+ /*IsRecord=*/true, /*IsNative=*/true, OMPX_RecordOutput, EmitReport,
+ OMPX_RecordReportFile.get().c_str(),
+ OMPX_RecordOutputDir.get().c_str());
if (Ret != OFFLOAD_SUCCESS)
return error::createOffloadError(error::ErrorCode::BACKEND_FAILURE,
"failed to initialize RR in device %d\n",
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 17b215732d51b..4d7a01853c2e1 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2383,7 +2383,8 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
const char *OutputDirPath) {
return Device.RTL->initialize_record_replay(
Device.DeviceID, MemorySize, VAddr, IsRecord,
- /*IsNative=*/true, SaveOutput, EmitReport, OutputDirPath);
+ /*IsNative=*/true, SaveOutput, EmitReport, /*ReportFile=*/"",
+ OutputDirPath);
}
/// Executes a kernel using pre-recorded information for loading to
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f50b07aad0209..6e208bbc7e056 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1252,7 +1252,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error initRecordReplay(int64_t Size, void *VAddr, bool IsRecord,
bool IsNative, bool SaveOutput, bool EmitReport,
- const char *OutputDirPath) {
+ const char *ReportFile, const char *OutputDirPath) {
if (RecordReplay)
return Plugin::error(error::ErrorCode::INVALID_ARGUMENT,
"RR already initialized");
@@ -1267,7 +1267,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
RecordReplay =
new NativeRecordReplayTy(Status, OutputDirPath ? OutputDirPath : "",
- SaveOutput, EmitReport, *this);
+ SaveOutput, EmitReport, ReportFile, *this);
return RecordReplay->init(Size, VAddr);
}
@@ -1587,6 +1587,7 @@ struct GenericPluginTy {
int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool IsRecord, bool IsNative,
bool SaveOutput, bool EmitReport,
+ const char *ReportFile,
const char *OutputDirPath);
/// Loads the associated binary into the plugin and returns a handle to it.
diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h
index 65a861cc8a0cc..6f083943131d2 100644
--- a/offload/plugins-nextgen/common/include/RecordReplay.h
+++ b/offload/plugins-nextgen/common/include/RecordReplay.h
@@ -80,9 +80,12 @@ struct RecordReplayTy {
/// Whether a memory snapshot should be recorded a kernel execution.
bool SaveOutput;
- /// Whether a report should be emitted afther the recording.
+ /// Whether a report should be emitted after the recording.
bool EmitReport;
+ /// The name of the file where to emit the record report.
+ std::string ReportFile;
+
/// Reference to the corresponding device.
GenericDeviceTy &Device;
@@ -157,13 +160,14 @@ struct RecordReplayTy {
/// Tracker of record replay instances.
std::unordered_set<InstanceTy, InstanceHasher> Instances;
+ SmallVector<const InstanceTy *> OrderedInstances;
std::mutex InstancesLock;
public:
RecordReplayTy(StatusTy Status, StringRef OutputDirectoryStr, bool SaveOutput,
- bool EmitReport, GenericDeviceTy &Device)
+ bool EmitReport, StringRef ReportFile, GenericDeviceTy &Device)
: Status(Status), SaveOutput(SaveOutput), EmitReport(EmitReport),
- Device(Device) {
+ ReportFile(ReportFile.str()), Device(Device) {
if (OutputDirectoryStr == "")
OutputDirectory = std::filesystem::current_path();
else
@@ -260,10 +264,10 @@ struct RecordReplayTy {
/// The native kernel record replay support.
struct NativeRecordReplayTy : public RecordReplayTy {
NativeRecordReplayTy(StatusTy Status, StringRef OutputDirectoryStr,
- bool SaveOutput, bool EmitReport,
+ bool SaveOutput, bool EmitReport, StringRef ReportFile,
GenericDeviceTy &Device)
: RecordReplayTy(Status, OutputDirectoryStr, SaveOutput, EmitReport,
- Device) {}
+ ReportFile, Device) {}
private:
Error recordPrologueImpl(const GenericKernelTy &Kernel,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f8f362cf2b4ed..a17a7a9d3e7e3 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1498,13 +1498,13 @@ int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
int32_t GenericPluginTy::initialize_record_replay(
int32_t DeviceId, int64_t MemorySize, void *VAddr, bool IsRecord,
- bool IsNative, bool SaveOutput, bool EmitReport,
+ bool IsNative, bool SaveOutput, bool EmitReport, const char *ReportFile,
const char *OutputDirPath) {
GenericDeviceTy &Device = getDevice(DeviceId);
- if (auto Err =
- Device.initRecordReplay(MemorySize, VAddr, IsRecord, IsNative,
- SaveOutput, EmitReport, OutputDirPath)) {
+ if (auto Err = Device.initRecordReplay(MemorySize, VAddr, IsRecord, IsNative,
+ SaveOutput, EmitReport, ReportFile,
+ OutputDirPath)) {
REPORT() << "Failure to initialize RR with " << MemorySize
<< " bytes on device " << DeviceId << ": "
<< toString(std::move(Err));
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index e89b516983e41..a09a127c43ca4 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -88,20 +88,34 @@ Error RecordReplayTy::deinit() {
}
Error RecordReplayTy::emitInstanceReport() {
+ llvm::raw_ostream *OutStream = &llvm::outs();
+ std::unique_ptr<llvm::raw_fd_ostream> FileOut;
+
+ if (!ReportFile.empty()) {
+ // The report file is emitted in the output directory.
+ std::string ReportFilePath =
+ (std::filesystem::absolute(OutputDirectory) / ReportFile).string();
+ std::error_code EC;
+ FileOut = std::make_unique<llvm::raw_fd_ostream>(ReportFilePath, EC);
+ if (EC)
+ return Plugin::error(ErrorCode::HOST_IO, "saving report file");
+ OutStream = FileOut.get();
+ }
+
std::lock_guard<std::mutex> LG(InstancesLock);
- llvm::outs() << "=== Kernel Record Report ===\n";
- llvm::outs() << "Directory: "
- << std::filesystem::absolute(OutputDirectory).string() << "\n";
- llvm::outs() << "Total Instances: " << Instances.size() << "\n";
- llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
+ *OutStream << "=== Kernel Record Report ===\n";
+ *OutStream << "Directory: "
+ << std::filesystem::absolute(OutputDirectory).string() << "\n";
+ *OutStream << "Total Instances: " << OrderedInstances.size() << "\n";
+ *OutStream << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
SmallString<128> Filename;
- for (const auto &Inst : Instances)
- llvm::outs()
- << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
- << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
- << ", " << Inst.Occurrences << "\n";
- llvm::outs() << "=== End Kernel Record Report ===\n";
+ for (const auto *Inst : OrderedInstances)
+ *OutStream
+ << getFilename(*Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
+ << ", " << Inst->Kernel.getName() << ", " << Inst->getRecordedTimeNs()
+ << ", " << Inst->Occurrences << "\n";
+ *OutStream << "=== End Kernel Record Report ===\n";
return Plugin::success();
}
@@ -114,18 +128,24 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
std::lock_guard<std::mutex> LG(InstancesLock);
auto [It, Inserted] = Instances.emplace(Kernel, NumTeams, NumThreads,
SharedMemorySize, ReplayOutcome);
+ // Keep insertion order.
+ if (Inserted)
+ OrderedInstances.push_back(&(*It));
+
// Increase the number of occurrences.
It->Occurrences += 1;
- return {*It, Inserted};
+
+ // Return reference and whether it was registered for the first time. Notice
+ // that registering an unregistered instance counts as a new registration.
+ return {*It, (It->Occurrences == 1)};
}
Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
assert(isReplaying() && "Cannot unregister instance when recording.");
+ // Do not remove it, it may be reused in the future.
std::lock_guard<std::mutex> LG(InstancesLock);
- size_t Erased = Instances.erase(Instance);
- if (Erased != 1)
- return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
+ Instance.Occurrences = 0;
return Plugin::success();
}
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 8b3b7e9bed0c6..098946b2743d6 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1279,6 +1279,7 @@ is provided below.
* ``LIBOMPTARGET_RECORD=[TRUE/FALSE] (default FALSE)``
* ``LIBOMPTARGET_RECORD_DIR=<Filepath>``
* ``LIBOMPTARGET_RECORD_REPORT=[TRUE/FALSE] (default FALSE)``
+* ``LIBOMPTARGET_RECORD_REPORT_FILE=<Filename>``
* ``LIBOMPTARGET_RECORD_MEMSIZE=<Num> (default 8*1024*1024*1024)``
* ``LIBOMPTARGET_RECORD_DEVICE=<Num> (default 0)``
* ``LIBOMPTARGET_RECORD_OUTPUT=[TRUE/FALSE] (default TRUE)``
@@ -1309,8 +1310,20 @@ LIBOMPTARGET_RECORD_REPORT
""""""""""""""""""""""""""
This environment variable is used to instruct the runtime to emit a summary of
-the recorded kernel instances and their associated JSON files. By default, no
-report is emitted.
+the recorded kernel instances and their associated JSON files. When enabled, the
+report is emitted in the standard output. The environment variable
+:ref:`LIBOMPTARGET_RECORD_REPORT_FILE` can be used to indicate a file where to
+emit the report. By default, no report is emitted.
+
+.. _libomptarget_record_report_file:
+
+LIBOMPTARGET_RECORD_REPORT_FILE
+"""""""""""""""""""""""""""""""
+
+This environment variable is used to instruct the runtime to emit the recording
+report to a specific output file. The file will be created within the recording
+directory. Note that it is not needed to use :ref:`LIBOMPTARGET_RECORD_REPORT`
+when setting this enviornment variable.
LIBOMPTARGET_RECORD_MEMSIZE
"""""""""""""""""""""""""""
>From 0f56de1e5fd353b55702bb66a77f394d9da90902 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 5 Jun 2026 00:27:07 -0700
Subject: [PATCH 2/2] [offload][OpenMP] Fix record replay when no memory is
used
Progams that do not use any memory (e.g., no mappings) were failing because
we were trying to execute zero size transfers.
---
offload/libomptarget/omptarget.cpp | 30 +++++++++++--------
.../common/src/RecordReplay.cpp | 24 ++++++++-------
.../record-replay-empty-memory.cpp | 26 ++++++++++++++++
.../kernelreplay/llvm-omp-kernel-replay.cpp | 3 +-
4 files changed, 60 insertions(+), 23 deletions(-)
create mode 100644 offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 4d7a01853c2e1..facc8a0a7b030 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2440,6 +2440,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
// Initialize the device memory of each global.
for (int32_t I = 0; I < NumGlobals; ++I) {
assert(Globals[I].AuxAddr && "Global has no AuxAddr.");
+ assert(Globals[I].Size && "Global has Size zero.");
// Initialize the value of the global in the device.
int Ret = Device.submitData(Symbols[I + 1].DevPtr, Globals[I].AuxAddr,
@@ -2450,25 +2451,30 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
}
}
- // Reuse a previous device allocation or allocate a new device buffer.
+ // Reuse a previous device allocation or allocate a new device buffer. Do not
+ // allocate anything if the size is zero.
void *&TgtPtr = ReuseDeviceAlloc;
- if (!TgtPtr)
+ if (!TgtPtr && DeviceMemorySize) {
TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
TARGET_ALLOC_DEFAULT);
- if (!TgtPtr) {
- REPORT() << "Failed to allocate device memory.";
- return OFFLOAD_FAIL;
+ if (!TgtPtr) {
+ REPORT() << "Failed to allocate device memory.";
+ return OFFLOAD_FAIL;
+ }
}
// Save the device allocation for future replays of the same kernel.
if (ReplayOutcome)
ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
- int Ret =
- Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
- if (Ret != OFFLOAD_SUCCESS) {
- REPORT() << "Failed to submit data to a global.";
- return OFFLOAD_FAIL;
+ // Initialize the device memory.
+ if (DeviceMemorySize) {
+ int Ret =
+ Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
+ if (Ret != OFFLOAD_SUCCESS) {
+ REPORT() << "Failed to submit data to the device memory.";
+ return OFFLOAD_FAIL;
+ }
}
KernelArgsTy KernelArgs{};
@@ -2487,8 +2493,8 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelExtraArgsTy KernelExtraArgs{};
KernelExtraArgs.ReplayOutcome = ReplayOutcome;
- Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs,
- &KernelExtraArgs, AsyncInfo);
+ int Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets,
+ KernelArgs, &KernelExtraArgs, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT() << "Failed to launch kernel replay.";
return OFFLOAD_FAIL;
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index a09a127c43ca4..fe5629483b53e 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -338,17 +338,21 @@ Error NativeRecordReplayTy::recordSnapshot(StringRef Filename) {
uint64_t RecordSize = CurrentSize;
AllocationLock.unlock();
- ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
- WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
- if (!DeviceMemoryMB)
- return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
- "creating MemoryBuffer for device memory");
-
- if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
- StartAddr, RecordSize, nullptr))
- return Err;
+ StringRef DeviceMemory;
+ if (RecordSize) {
+ ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
+ WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
+ if (!DeviceMemoryMB)
+ return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
+ "creating MemoryBuffer for device memory");
+
+ if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
+ StartAddr, RecordSize, nullptr))
+ return Err;
- StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), RecordSize);
+ DeviceMemory =
+ StringRef(DeviceMemoryMB.get()->getBufferStart(), RecordSize);
+ }
std::error_code EC;
raw_fd_ostream OS(Filename, EC);
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
new file mode 100644
index 0000000000000..0705c6d66ac8e
--- /dev/null
+++ b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
+// clang-format on
+
+// REQUIRES: gpu
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: x86_64-unknown-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: intelgpu
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+#pragma omp target teams num_teams(256)
+ {
+ }
+
+ // CHECK: PASS
+ printf("PASS\n");
+}
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index a5bda7a0f0444..4335002fd8c77 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -130,7 +130,8 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
if (!ReplayOutputBufferOrErr)
return createErr("failed to read the kernel replay output file");
- // Compare record and replay outputs to verify they match.
+ // Compare record and replay outputs to verify they match. If they are both
+ // empty, the verification is successful.
StringRef RecordOutput = RecordOutputBufferOrErr.get()->getBuffer();
StringRef ReplayOutput = ReplayOutputBufferOrErr.get()->getBuffer();
if (RecordOutput != ReplayOutput)
More information about the Openmp-commits
mailing list