[Openmp-commits] [openmp] [OpenMP][FIX] Ensure recording works properly w/ late allocations (PR #72009)
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Fri Nov 10 17:28:09 PST 2023
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/72009
>From 60f2a0b9fb2167dc195b3547b5aeed3af00ddd15 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 10 Nov 2023 17:02:08 -0800
Subject: [PATCH 1/2] [OpenMP][NFC] Remove std::move to silence warnings
---
.../plugins-nextgen/amdgpu/src/rtl.cpp | 6 +++---
.../common/PluginInterface/PluginInterface.cpp | 2 +-
.../libomptarget/plugins-nextgen/cuda/src/rtl.cpp | 14 +++++++-------
3 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..c66279fc0bcc5eb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2653,7 +2653,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy AMDGPUKernel(Name);
if (auto Err = AMDGPUKernel.init(*this, Image))
- return std::move(Err);
+ return Err;
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
@@ -2661,12 +2661,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
/*Args=*/nullptr, AsyncInfoWrapper))
- return std::move(Err);
+ return Err;
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
- return std::move(Err);
+ return Err;
}
/// Envar for controlling the number of HSA queues per device. High number of
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 3f798a908e7361e..663e4bfa3536b5a 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -719,7 +719,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = callGlobalDestructors(Plugin, *Image))
- return std::move(Err);
+ return Err;
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index a6e28574a7f08e3..d1473c94af8a630 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -1106,30 +1106,30 @@ struct CUDADeviceTy : public GenericDeviceTy {
for (auto [Name, Priority] : Funcs) {
GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
- return std::move(Err);
+ return Err;
}
// Copy the local buffer to the device.
if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
FunctionPtrs.size() * sizeof(void *), nullptr))
- return std::move(Err);
+ return Err;
// Copy the created buffer to the appropriate symbols so the kernel can
// iterate through them.
GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
sizeof(void *), &GlobalPtrStart);
if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
- return std::move(Err);
+ return Err;
GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
sizeof(void *), &GlobalPtrStop);
if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
- return std::move(Err);
+ return Err;
CUDAKernelTy CUDAKernel(KernelName);
if (auto Err = CUDAKernel.init(*this, Image))
- return std::move(Err);
+ return Err;
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
@@ -1137,7 +1137,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs, nullptr,
AsyncInfoWrapper))
- return std::move(Err);
+ return Err;
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
@@ -1145,7 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
return Plugin::error("Failed to free memory for global buffer");
- return std::move(Err);
+ return Err;
}
/// Stream manager for CUDA streams.
>From 8108c345046df81f79e0d7c1c52df2ea89ba017f Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 10 Nov 2023 17:20:49 -0800
Subject: [PATCH 2/2] [OpenMP][FIX] Ensure recording works properly w/ late
allocations
---
.../PluginInterface/PluginInterface.cpp | 50 +++++++++++--------
1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 663e4bfa3536b5a..08946e21035014e 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -231,9 +231,9 @@ struct RecordReplayTy {
OS.close();
}
- void saveKernelDescr(const char *Name, void **ArgPtrs, ptrdiff_t *ArgOffsets,
- int32_t NumArgs, uint64_t NumTeamsClause,
- uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
+ void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
+ uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+ uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -251,7 +251,7 @@ struct RecordReplayTy {
json::Array JsonArgOffsets;
for (int I = 0; I < NumArgs; ++I)
- JsonArgOffsets.push_back(ArgOffsets[I]);
+ JsonArgOffsets.push_back(0);
JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
SmallString<128> JsonFilename = {Name, ".json"};
@@ -427,6 +427,11 @@ Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ // Ctor/Dtor have no arguments, replaying uses the original kernel launch
+ // environment.
+ if (isCtorOrDtor() || RecordReplay.isReplaying())
+ return nullptr;
+
// TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr,
@@ -501,6 +506,15 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount,
NumThreads, KernelArgs.ThreadLimit[0] > 0);
+ // Record the kernel description after we modified the argument count and num
+ // blocks/threads.
+ if (RecordReplay.isRecording()) {
+ RecordReplay.saveImage(getName(), getImage());
+ RecordReplay.saveKernelInput(getName(), getImage());
+ RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
+ NumBlocks, NumThreads, KernelArgs.Tripcount);
+ }
+
if (auto Err =
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
@@ -517,16 +531,20 @@ void *GenericKernelTy::prepareArgs(
if (isCtorOrDtor())
return nullptr;
- NumArgs += 1;
+ uint32_t KLEOffset = !!KernelLaunchEnvironment;
+ NumArgs += KLEOffset;
Args.resize(NumArgs);
Ptrs.resize(NumArgs);
- Ptrs[0] = KernelLaunchEnvironment;
- Args[0] = &Ptrs[0];
+ if (KernelLaunchEnvironment) {
+ Ptrs[0] = KernelLaunchEnvironment;
+ Args[0] = &Ptrs[0];
+ }
- for (int I = 1; I < NumArgs; ++I) {
- Ptrs[I] = (void *)((intptr_t)ArgPtrs[I - 1] + ArgOffsets[I - 1]);
+ for (int I = KLEOffset; I < NumArgs; ++I) {
+ Ptrs[I] =
+ (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
Args[I] = &Ptrs[I];
}
return &Args[0];
@@ -808,7 +826,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return std::move(Err);
// Setup the global device memory pool if needed.
- if (shouldSetupDeviceMemoryPool()) {
+ if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) {
uint64_t HeapSize;
auto SizeOrErr = getDeviceHeapSize(HeapSize);
if (SizeOrErr) {
@@ -1413,21 +1431,9 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);
- if (RecordReplay.isRecording()) {
- RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage());
- RecordReplay.saveKernelInput(GenericKernel.getName(),
- GenericKernel.getImage());
- }
-
auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
AsyncInfoWrapper);
- if (RecordReplay.isRecording())
- RecordReplay.saveKernelDescr(GenericKernel.getName(), ArgPtrs, ArgOffsets,
- KernelArgs.NumArgs, KernelArgs.NumTeams[0],
- KernelArgs.ThreadLimit[0],
- KernelArgs.Tripcount);
-
// 'finalize' here to guarantee next record-replay actions are in-sync
AsyncInfoWrapper.finalize(Err);
More information about the Openmp-commits mailing list