[llvm] [Offload] Implement `olShutDown` (PR #144055)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 30 03:13:30 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/144055
>From 364ad9a4c8261e249b925a20c2d78b72fdcfb92c Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Thu, 12 Jun 2025 10:47:19 +0100
Subject: [PATCH 1/4] [Offload] Implement `olShutDown`
`olShutDown` was not properly calling deinit on the platforms, resulting
in random segfaults on AMD devices.
---
offload/liboffload/API/Common.td | 2 +-
offload/liboffload/src/OffloadImpl.cpp | 37 ++++++++++++++++----
offload/unittests/OffloadAPI/init/olInit.cpp | 12 +++++++
3 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 79c3bd46f1984..669dfd3cca7c6 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -176,7 +176,7 @@ def : Function {
let desc = "Release the resources in use by Offload";
let details = [
"This decrements an internal reference count. When this reaches 0, all resources will be released",
- "Subsequent API calls made after this are not valid"
+ "Subsequent API calls to methods other than `olInit` made after resources are released will return OL_ERRC_UNINITIALIZED"
];
let params = [];
let returns = [];
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 6adebb25a2db0..1719c896a90f3 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -97,6 +97,7 @@ struct AllocInfo {
// Global shared state for liboffload
struct OffloadContext;
static OffloadContext *OffloadContextVal;
+std::mutex OffloadContextValMutex;
struct OffloadContext {
OffloadContext(OffloadContext &) = delete;
OffloadContext(OffloadContext &&) = delete;
@@ -107,6 +108,7 @@ struct OffloadContext {
bool ValidationEnabled = true;
DenseMap<void *, AllocInfo> AllocInfoMap{};
SmallVector<ol_platform_impl_t, 4> Platforms{};
+ size_t RefCount;
ol_device_handle_t HostDevice() {
// The host platform is always inserted last
@@ -191,18 +193,41 @@ Error initPlugins() {
return Plugin::success();
}
-// TODO: We can properly reference count here and manage the resources in a more
-// clever way
Error olInit_impl() {
- static std::once_flag InitFlag;
- std::optional<Error> InitResult{};
- std::call_once(InitFlag, [&] { InitResult = initPlugins(); });
+ std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+
+ std::optional<Error> InitResult;
+ if (!isOffloadInitialized())
+ InitResult = initPlugins();
+
+ OffloadContext::get().RefCount++;
if (InitResult)
return std::move(*InitResult);
return Error::success();
}
-Error olShutDown_impl() { return Error::success(); }
+
+Error olShutDown_impl() {
+ std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+
+ if (--OffloadContext::get().RefCount != 0)
+ return Error::success();
+
+ llvm::Error Result = Error::success();
+
+ for (auto &P : OffloadContext::get().Platforms) {
+ // Host plugin is nullptr and has no deinit
+ if (!P.Plugin)
+ continue;
+
+ if (auto Res = P.Plugin->deinit())
+ Result = llvm::joinErrors(std::move(Result), std::move(Res));
+ }
+ delete OffloadContextVal;
+ OffloadContextVal = nullptr;
+
+ return Result;
+}
Error olGetPlatformInfoImplDetail(ol_platform_handle_t Platform,
ol_platform_info_t PropName, size_t PropSize,
diff --git a/offload/unittests/OffloadAPI/init/olInit.cpp b/offload/unittests/OffloadAPI/init/olInit.cpp
index 8e27e77cd0fb5..508615152b4f1 100644
--- a/offload/unittests/OffloadAPI/init/olInit.cpp
+++ b/offload/unittests/OffloadAPI/init/olInit.cpp
@@ -15,8 +15,20 @@
struct olInitTest : ::testing::Test {};
+TEST_F(olInitTest, Success) {
+ ASSERT_SUCCESS(olInit());
+ ASSERT_SUCCESS(olShutDown());
+}
+
TEST_F(olInitTest, Uninitialized) {
ASSERT_ERROR(OL_ERRC_UNINITIALIZED,
olIterateDevices(
[](ol_device_handle_t, void *) { return false; }, nullptr));
}
+
+TEST_F(olInitTest, RepeatedInit) {
+ for (size_t I = 0; I < 10; I++) {
+ ASSERT_SUCCESS(olInit());
+ ASSERT_SUCCESS(olShutDown());
+ }
+}
>From 42ee27834bc6ddd22e93d3a19fe3f75bb642f63c Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 27 Jun 2025 16:58:34 +0100
Subject: [PATCH 2/4] Move init to olInit
---
offload/liboffload/src/OffloadImpl.cpp | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 1719c896a90f3..9cf52171181f4 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -148,19 +148,19 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) {
#include "Shared/Targets.def"
Error initPlugins() {
- auto *Context = new OffloadContext{};
+ auto &Context = OffloadContext::get();
// Attempt to create an instance of each supported plugin.
#define PLUGIN_TARGET(Name) \
do { \
- Context->Platforms.emplace_back(ol_platform_impl_t{ \
+ Context.Platforms.emplace_back(ol_platform_impl_t{ \
std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), \
pluginNameToBackend(#Name)}); \
} while (false);
#include "Shared/Targets.def"
// Preemptively initialize all devices in the plugin
- for (auto &Platform : Context->Platforms) {
+ for (auto &Platform : Context.Platforms) {
// Do not use the host plugin - it isn't supported.
if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN)
continue;
@@ -180,15 +180,13 @@ Error initPlugins() {
}
// Add the special host device
- auto &HostPlatform = Context->Platforms.emplace_back(
+ auto &HostPlatform = Context.Platforms.emplace_back(
ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
HostPlatform.Devices.emplace_back(-1, nullptr, nullptr, InfoTreeNode{});
- Context->HostDevice()->Platform = &HostPlatform;
+ Context.HostDevice()->Platform = &HostPlatform;
- Context->TracingEnabled = std::getenv("OFFLOAD_TRACE");
- Context->ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
-
- OffloadContextVal = Context;
+ Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
+ Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
return Plugin::success();
}
@@ -197,8 +195,10 @@ Error olInit_impl() {
std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
std::optional<Error> InitResult;
- if (!isOffloadInitialized())
+ if (!isOffloadInitialized()) {
+ OffloadContextVal = new OffloadContext{};
InitResult = initPlugins();
+ }
OffloadContext::get().RefCount++;
>From 782d74d064822f1a0154bfdf1a36dee2ab4c8be6 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 30 Jun 2025 10:10:50 +0100
Subject: [PATCH 3/4] Reorder logic
---
offload/liboffload/src/OffloadImpl.cpp | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 9cf52171181f4..3d75dcf865624 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -194,17 +194,16 @@ Error initPlugins() {
Error olInit_impl() {
std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
- std::optional<Error> InitResult;
- if (!isOffloadInitialized()) {
- OffloadContextVal = new OffloadContext{};
- InitResult = initPlugins();
+ if (isOffloadInitialized()) {
+ OffloadContext::get().RefCount++;
+ return Plugin::success();
}
+ OffloadContextVal = new OffloadContext{};
+ Error InitResult = initPlugins();
OffloadContext::get().RefCount++;
- if (InitResult)
- return std::move(*InitResult);
- return Error::success();
+ return InitResult;
}
Error olShutDown_impl() {
>From a304dd7fd614ae90c8f69b8d1ab61cc11d72e621 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 30 Jun 2025 11:13:13 +0100
Subject: [PATCH 4/4] Work around race condition
---
offload/liboffload/src/OffloadImpl.cpp | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3d75dcf865624..9d4f4f54a8217 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -96,7 +96,9 @@ struct AllocInfo {
// Global shared state for liboffload
struct OffloadContext;
-static OffloadContext *OffloadContextVal;
+// This pointer is non-null if and only if the context is valid and fully
+// initialized
+static std::atomic<OffloadContext *> OffloadContextVal;
std::mutex OffloadContextValMutex;
struct OffloadContext {
OffloadContext(OffloadContext &) = delete;
@@ -147,9 +149,7 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) {
#define PLUGIN_TARGET(Name) extern "C" GenericPluginTy *createPlugin_##Name();
#include "Shared/Targets.def"
-Error initPlugins() {
- auto &Context = OffloadContext::get();
-
+Error initPlugins(OffloadContext &Context) {
// Attempt to create an instance of each supported plugin.
#define PLUGIN_TARGET(Name) \
do { \
@@ -199,8 +199,11 @@ Error olInit_impl() {
return Plugin::success();
}
- OffloadContextVal = new OffloadContext{};
- Error InitResult = initPlugins();
+ // Use a temporary to ensure that entry points querying OffloadContextVal do
+ // not get a partially initialized context
+ auto *NewContext = new OffloadContext{};
+ Error InitResult = initPlugins(*NewContext);
+ OffloadContextVal.store(NewContext);
OffloadContext::get().RefCount++;
return InitResult;
@@ -213,8 +216,9 @@ Error olShutDown_impl() {
return Error::success();
llvm::Error Result = Error::success();
+ auto *OldContext = OffloadContextVal.exchange(nullptr);
- for (auto &P : OffloadContext::get().Platforms) {
+ for (auto &P : OldContext->Platforms) {
// Host plugin is nullptr and has no deinit
if (!P.Plugin)
continue;
@@ -222,9 +226,8 @@ Error olShutDown_impl() {
if (auto Res = P.Plugin->deinit())
Result = llvm::joinErrors(std::move(Result), std::move(Res));
}
- delete OffloadContextVal;
- OffloadContextVal = nullptr;
+ delete OldContext;
return Result;
}
More information about the llvm-commits
mailing list