[llvm] [Offload] Full AMD support for olMemFill (PR #154958)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 07:35:01 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/154958
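For context, olMemFill writes a repeating pattern into a device allocation via a queue. A minimal sketch of the call sequence, following the unit tests in this patch (error handling and device/queue setup omitted):

    void *Alloc;
    olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, /*Size=*/1024, &Alloc);

    uint32_t Pattern = 0xDEADBEEF;
    // Fill 1024 bytes with the 4-byte pattern; the fill is enqueued on Queue.
    olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, /*Size=*/1024);

    olSyncQueue(Queue); // wait for the fill to complete before reading Alloc
    olMemFree(Alloc);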
From 0d6ace813fe295d4f0d8a62eefc4a05db700ed57 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 19 Aug 2025 13:43:01 +0100
Subject: [PATCH 1/2] [Offload] Full AMD support for olMemFill
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 115 +++++++++++++----
.../unittests/OffloadAPI/common/Fixtures.hpp | 34 +++++
.../unittests/OffloadAPI/memory/olMemFill.cpp | 117 +++++++++++++-----
3 files changed, 212 insertions(+), 54 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 7ba55715ff58d..4cc158f326add 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -924,6 +924,7 @@ struct AMDGPUStreamTy {
void *Dst;
const void *Src;
size_t Size;
+ size_t NumTimes;
};
/// Utility struct holding arguments for freeing buffers to memory managers.
@@ -974,9 +975,14 @@ struct AMDGPUStreamTy {
StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
/// Schedule a host memory copy action on the slot.
- Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+ ///
+ /// NumTimes repeats the copy that many times, writing sequentially into the
+ /// destination buffer.
+ Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+ size_t NumTimes = 1) {
Callbacks.emplace_back(memcpyAction);
- ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+ ActionArgs.emplace_back().MemcpyArgs =
+ MemcpyArgsTy{Dst, Src, Size, NumTimes};
return Plugin::success();
}
@@ -1216,7 +1222,12 @@ struct AMDGPUStreamTy {
assert(Args->Dst && "Invalid destination buffer");
assert(Args->Src && "Invalid source buffer");
- std::memcpy(Args->Dst, Args->Src, Args->Size);
+ auto BasePtr = Args->Dst;
+ for (size_t I = 0; I < Args->NumTimes; I++) {
+ std::memcpy(BasePtr, Args->Src, Args->Size);
+ BasePtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(BasePtr) +
+ Args->Size);
+ }
return Plugin::success();
}
@@ -1421,7 +1432,8 @@ struct AMDGPUStreamTy {
/// manager once the operation completes.
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
- AMDGPUMemoryManagerTy &MemoryManager) {
+ AMDGPUMemoryManagerTy &MemoryManager,
+ size_t NumTimes = 1) {
// Retrieve available signals for the operation's outputs.
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1443,7 +1455,8 @@ struct AMDGPUStreamTy {
// The std::memcpy is done asynchronously using an async handler. We store
// the function's information in the action but it is not actually a
// post action.
- if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+ if (auto Err =
+ Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
return Err;
// Make changes on this slot visible to the async handler's thread.
@@ -1464,7 +1477,12 @@ struct AMDGPUStreamTy {
std::tie(Curr, InputSignal) = consume(OutputSignal);
} else {
// All preceding operations completed, copy the memory synchronously.
- std::memcpy(Inter, Src, CopySize);
+ auto *InterPtr = Inter;
+ for (size_t I = 0; I < NumTimes; I++) {
+ std::memcpy(InterPtr, Src, CopySize);
+ InterPtr = reinterpret_cast<void *>(
+ reinterpret_cast<uintptr_t>(InterPtr) + CopySize);
+ }
// Return the second signal because it will not be used.
OutputSignals[1]->decreaseUseCount();
@@ -1481,11 +1499,11 @@ struct AMDGPUStreamTy {
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 1, &InputSignalRaw,
- OutputSignal->get());
+ Agent, CopySize * NumTimes, 1,
+ &InputSignalRaw, OutputSignal->get());
}
return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
- Agent, CopySize, 0, nullptr,
+ Agent, CopySize * NumTimes, 0, nullptr,
OutputSignal->get());
}
@@ -2611,26 +2629,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
- hsa_status_t Status;
+ // Fast path, where we can use the 4-byte hsa_amd_memory_fill
+ if (Size % 4 == 0 &&
+ (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+ uint32_t Pattern;
+ if (PatternSize == 1) {
+ // Widen before shifting so a high byte cannot overflow a promoted int.
+ uint32_t Byte = *reinterpret_cast<const uint8_t *>(PatternPtr);
+ Pattern = Byte | Byte << 8 | Byte << 16 | Byte << 24;
+ } else if (PatternSize == 2) {
+ uint32_t Word = *reinterpret_cast<const uint16_t *>(PatternPtr);
+ Pattern = Word | (Word << 16);
+ } else if (PatternSize == 4) {
+ Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+ } else {
+ // Shouldn't be here if the pattern size is outwith those values
+ std::terminate();
+ }
- // We can use hsa_amd_memory_fill for this size, but it's not async so the
- // queue needs to be synchronized first
- if (PatternSize == 4) {
- if (AsyncInfoWrapper.hasQueue())
- if (auto Err = synchronize(AsyncInfoWrapper))
+ if (hasPendingWorkImpl(AsyncInfoWrapper)) {
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
- Status = hsa_amd_memory_fill(TgtPtr,
- *static_cast<const uint32_t *>(PatternPtr),
- Size / PatternSize);
- if (auto Err =
- Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
- return Err;
- } else {
- // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
- // memory and copying to the device in one go.
- return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+ struct MemFillArgsTy {
+ void *Dst;
+ uint32_t Pattern;
+ int64_t Size;
+ };
+ auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+ auto Fill = [](void *Data) {
+ MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+ assert(Args && "Invalid arguments");
+
+ auto Status =
+ hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+ delete Args;
+ auto Err =
+ Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ if (Err) {
+ FATAL_MESSAGE(1, "error performing async fill: %s",
+ toString(std::move(Err)).data());
+ }
+ };
+
+ // hsa_amd_memory_fill does not report completion via an HSA signal, so use
+ // the existing host callback logic to handle completion instead
+ return Stream->pushHostCallback(Fill, Args);
+ } else {
+ // If there is no pending work, do the fill synchronously
+ auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+ return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+ }
}
+
+ // Slow path; allocate an appropriately sized pinned buffer and enqueue copies
+ void *PinnedPtr = nullptr;
+ AMDGPUMemoryManagerTy &PinnedMemoryManager =
+ HostDevice.getPinnedMemoryManager();
+ if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+ return Err;
+
+ AMDGPUStreamTy *Stream = nullptr;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+ PatternSize, PinnedMemoryManager,
+ Size / PatternSize);
}
/// Initialize the async info for interoperability purposes.
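With this change, dataFillImpl takes a fast path whenever the total size is a multiple of four bytes and the pattern is 1, 2 or 4 bytes wide: the pattern is broadcast into a 32-bit value and handed to hsa_amd_memory_fill. A sketch of that broadcast, mirroring the hunk above (the helper name is illustrative only):

    // Widen a 1-, 2- or 4-byte pattern to the 32-bit value expected by
    // hsa_amd_memory_fill, e.g. 0x42 -> 0x42424242, 0x4243 -> 0x42434243.
    static uint32_t broadcastPattern(const void *PatternPtr, int64_t PatternSize) {
      if (PatternSize == 1) {
        uint32_t Byte = *reinterpret_cast<const uint8_t *>(PatternPtr);
        return Byte | Byte << 8 | Byte << 16 | Byte << 24;
      }
      if (PatternSize == 2) {
        uint32_t Word = *reinterpret_cast<const uint16_t *>(PatternPtr);
        return Word | Word << 16;
      }
      return *reinterpret_cast<const uint32_t *>(PatternPtr); // PatternSize == 4
    }

All other pattern sizes take the slow path: the pattern is replicated Size / PatternSize times into a pinned host buffer and then copied to the device in a single asyncMemCopy of Size bytes.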
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index fe7198a9c283f..0538e60f276e3 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -89,6 +89,40 @@ template <typename Fn> inline void threadify(Fn body) {
}
}
+/// Enqueues a task onto the queue that can be manually resolved.
+/// It will block until `trigger` is called.
+struct ManuallyTriggeredTask {
+ std::mutex M;
+ std::condition_variable CV;
+ bool Flag = false;
+ ol_event_handle_t CompleteEvent;
+
+ ol_result_t enqueue(ol_queue_handle_t Queue) {
+ if (auto Err = olLaunchHostFunction(
+ Queue,
+ [](void *That) {
+ static_cast<ManuallyTriggeredTask *>(That)->wait();
+ },
+ this))
+ return Err;
+
+ return olCreateEvent(Queue, &CompleteEvent);
+ }
+
+ void wait() {
+ std::unique_lock<std::mutex> lk(M);
+ CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+ EXPECT_TRUE(Flag);
+ }
+
+ ol_result_t trigger() {
+ Flag = true;
+ CV.notify_one();
+
+ return olSyncEvent(CompleteEvent);
+ }
+};
+
struct OffloadTest : ::testing::Test {
ol_device_handle_t Host = TestEnvironment::getHostDevice();
};
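The ManuallyTriggeredTask fixture added above is what lets the tests exercise the asynchronous path: a blocking host task is enqueued first so the queue has pending work, which forces the subsequent olMemFill onto the stream instead of running synchronously. A minimal usage sketch, in the same shape as the *Enqueue tests below (Alloc, Pattern and Size as in those tests):

    ManuallyTriggeredTask Manual;
    ASSERT_SUCCESS(Manual.enqueue(Queue)); // queue now has pending work

    ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

    ASSERT_SUCCESS(Manual.trigger()); // unblock the host task
    olSyncQueue(Queue);               // wait for the fill to finish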
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
index 1b0bafa202080..a84ed3d78eccf 100644
--- a/offload/unittests/OffloadAPI/memory/olMemFill.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -10,75 +10,129 @@
#include <OffloadAPI.h>
#include <gtest/gtest.h>
-using olMemFillTest = OffloadQueueTest;
+struct olMemFillTest : OffloadQueueTest {
+ template <typename PatternTy, PatternTy PatternVal, size_t Size,
+ bool Block = false>
+ void test_body() {
+ ManuallyTriggeredTask Manual;
+
+ // The Enqueue variants block the queue first to ensure the fill is actually
+ // enqueued (rather than being performed synchronously because the queue
+ // happens to be empty).
+ if constexpr (Block) {
+ ASSERT_SUCCESS(Manual.enqueue(Queue));
+ }
+
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ PatternTy Pattern = PatternVal;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ if constexpr (Block) {
+ ASSERT_SUCCESS(Manual.trigger());
+ }
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ PatternTy *AllocPtr = reinterpret_cast<PatternTy *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+ }
+};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
-TEST_P(olMemFillTest, Success8) {
- constexpr size_t Size = 1024;
- void *Alloc;
- ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
-
- uint8_t Pattern = 0x42;
- ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
-
- olSyncQueue(Queue);
+TEST_P(olMemFillTest, Success8) { test_body<uint8_t, 0x42, 1024>(); }
+TEST_P(olMemFillTest, Success8NotMultiple4) {
+ test_body<uint8_t, 0x42, 1023>();
+}
+TEST_P(olMemFillTest, Success8Enqueue) {
+ test_body<uint8_t, 0x42, 1024, true>();
+}
+TEST_P(olMemFillTest, Success8NotMultiple4Enqueue) {
+ test_body<uint8_t, 0x42, 1023, true>();
+}
- size_t N = Size / sizeof(Pattern);
- for (size_t i = 0; i < N; i++) {
- uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
- }
+TEST_P(olMemFillTest, Success16) { test_body<uint16_t, 0x4242, 1024>(); }
+TEST_P(olMemFillTest, Success16NotMultiple4) {
+ test_body<uint16_t, 0x4243, 1022>();
+}
+TEST_P(olMemFillTest, Success16Enqueue) {
+ test_body<uint16_t, 0x4242, 1024, true>();
+}
+TEST_P(olMemFillTest, Success16NotMultiple4Enqueue) {
+ test_body<uint16_t, 0x4243, 1022, true>();
+}
- olMemFree(Alloc);
+TEST_P(olMemFillTest, Success32) { test_body<uint32_t, 0xDEADBEEF, 1024>(); }
+TEST_P(olMemFillTest, Success32Enqueue) {
+ test_body<uint32_t, 0xDEADBEEF, 1024, true>();
}
-TEST_P(olMemFillTest, Success16) {
+TEST_P(olMemFillTest, SuccessLarge) {
constexpr size_t Size = 1024;
void *Alloc;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- uint16_t Pattern = 0x4242;
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
for (size_t i = 0; i < N; i++) {
- uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, Success32) {
+TEST_P(olMemFillTest, SuccessLargeEnqueue) {
constexpr size_t Size = 1024;
void *Alloc;
+ ManuallyTriggeredTask Manual;
+ ASSERT_SUCCESS(Manual.enqueue(Queue));
+
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- uint32_t Pattern = 0xDEADBEEF;
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+ ASSERT_SUCCESS(Manual.trigger());
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
for (size_t i = 0; i < N; i++) {
- uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
- ASSERT_EQ(AllocPtr[i], Pattern);
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, SuccessLarge) {
- constexpr size_t Size = 1024;
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+ constexpr size_t Size = 17 * 64;
void *Alloc;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
- struct PatternT {
+ struct __attribute__((packed)) PatternT {
uint64_t A;
uint64_t B;
- } Pattern{UINT64_MAX, UINT64_MAX};
+ uint8_t C;
+ } Pattern{UINT64_MAX, UINT64_MAX, 255};
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
@@ -89,14 +143,18 @@ TEST_P(olMemFillTest, SuccessLarge) {
PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].C, 255);
}
olMemFree(Alloc);
}
-TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+TEST_P(olMemFillTest, SuccessLargeByteAlignedEnqueue) {
constexpr size_t Size = 17 * 64;
void *Alloc;
+ ManuallyTriggeredTask Manual;
+ ASSERT_SUCCESS(Manual.enqueue(Queue));
+
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
struct __attribute__((packed)) PatternT {
@@ -107,6 +165,7 @@ TEST_P(olMemFillTest, SuccessLargeByteAligned) {
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+ ASSERT_SUCCESS(Manual.trigger());
olSyncQueue(Queue);
size_t N = Size / sizeof(Pattern);
From 1eee8e094d4e810d3c0485861fbf57a7b5a503f6 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 22 Aug 2025 15:34:46 +0100
Subject: [PATCH 2/2] Change to unreachable
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 4cc158f326add..41e963685eef1 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2643,7 +2643,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
} else {
// Shouldn't be here if the pattern size is outwith those values
- std::terminate();
+ llvm_unreachable("Invalid pattern size");
}
if (hasPendingWorkImpl(AsyncInfoWrapper)) {