[llvm] WIP: [Offload] Implement olMemFill (PR #154102)
Callum Fare via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 04:43:55 PDT 2025
https://github.com/callumfare updated https://github.com/llvm/llvm-project/pull/154102
>From 0c1583cb68fcee8d845e7d3dfb13e9a6e211e978 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Thu, 7 Aug 2025 13:12:55 +0100
Subject: [PATCH] [Offload] Implement olMemFill
---
offload/liboffload/API/Memory.td | 20 +++
offload/liboffload/src/OffloadImpl.cpp | 6 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 24 ++++
.../common/include/PluginInterface.h | 7 +
.../common/src/PluginInterface.cpp | 10 ++
.../cuda/dynamic_cuda/cuda.cpp | 7 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 10 ++
offload/plugins-nextgen/cuda/src/rtl.cpp | 52 +++++++
offload/plugins-nextgen/host/src/rtl.cpp | 6 +
offload/unittests/OffloadAPI/CMakeLists.txt | 1 +
.../unittests/OffloadAPI/memory/olMemFill.cpp | 134 ++++++++++++++++++
11 files changed, 277 insertions(+)
create mode 100644 offload/unittests/OffloadAPI/memory/olMemFill.cpp
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f7158588bc77..82f942b2e06c5 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -63,3 +63,23 @@ def : Function {
];
let returns = [];
}
+
def : Function {
  let name = "olMemFill";
  let desc = "Fill memory with copies of the given pattern";
  let details = [
    "Filling with patterns larger than 4 bytes may be less performant",
    "The destination pointer and queue must be associated with the same device",
    "The fill size must be a multiple of the pattern size",
  ];
  let params = [
    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
    Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
    Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
    // Previously had an empty description, which yields empty generated docs.
    Param<"void*", "PatternPtr", "pointer to the pattern to fill the memory with", PARAM_IN>,
    Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
  ];
  let returns = [
    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
  ];
}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 1c9dfc69d445a..0b6363ab6ffbf 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}
// Entry point for olMemFill: forwards directly to the plugin device's
// dataFill. NOTE(review): the OL_ERRC_INVALID_SIZE check documented in
// Memory.td is presumably emitted by the generated validation layer, not
// here — confirm, since this body does not check FillSize % PatternSize.
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                     void *PatternPtr, size_t FillSize) {
  return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
                                         Queue->AsyncInfo);
}
+
Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
size_t ProgDataSize, ol_program_handle_t *Program) {
// Make a copy of the program binary in case it is released by the caller.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 83280fe0a49c9..1fae38b925c61 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2583,6 +2583,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ hsa_status_t Status;
+
+ // We can use hsa_amd_memory_fill for this size, but it's not async so the
+ // queue needs to be synchronized first
+ if (PatternSize == 4) {
+ if (AsyncInfoWrapper.hasQueue())
+ if (auto Err = synchronize(AsyncInfoWrapper))
+ return Err;
+ Status = hsa_amd_memory_fill(TgtPtr, *(uint32_t *)(TgtPtr),
+ Size / PatternSize);
+
+ if (auto Err =
+ Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
+ return Err;
+ } else {
+ // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
+ // memory and copying to the device in one go.
+ return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+ }
+ }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6f..b2145979ae599 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ /// Fill data on the device with a pattern from the host
+ Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size, __tgt_async_info *AsyncInfo);
+ virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c06c35e1e6a5b..fb672c9782a8b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+ auto Err =
+ dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781e8f9b6..e8da25bc1d155 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
DLWRAP(cuMemcpyHtoD, 3)
DLWRAP(cuMemcpyHtoDAsync, 4)
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
DLWRAP(cuMemFreeAsync, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..93496d95327f3 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+ CUstream);
+
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index a99357a3adeaa..70020a6581a5c 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -844,6 +844,58 @@ struct CUDADeviceTy : public GenericDeviceTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ if (auto Err = setContext())
+ return Err;
+
+ CUstream Stream;
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ CUresult Res;
+ size_t N = Size / PatternSize;
+ if (PatternSize == 1) {
+ Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, *((const uint8_t *)PatternPtr),
+ N, Stream);
+ } else if (PatternSize == 2) {
+ Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+ *((const uint16_t *)PatternPtr), N, Stream);
+ } else if (PatternSize == 4) {
+ Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+ *((const uint32_t *)PatternPtr), N, Stream);
+ } else {
+ // For larger patterns we can do a series of strided fills to copy the
+ // pattern efficiently
+ int64_t MemsetSize = PatternSize % 4u == 0u ? 4u
+ : PatternSize % 2u == 0u ? 2u
+ : 1u;
+
+ int64_t NumberOfSteps = PatternSize / MemsetSize;
+ int64_t Pitch = NumberOfSteps * MemsetSize;
+ int64_t Height = Size / PatternSize;
+
+ for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
+ if (MemsetSize == 4) {
+ Res = cuMemsetD2D32Async(
+ (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+ *((const uint32_t *)PatternPtr + Step), 1u, Height, Stream);
+ } else if (MemsetSize == 2) {
+ Res = cuMemsetD2D16Async(
+ (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+ *((const uint16_t *)PatternPtr + Step), 1u, Height, Stream);
+ } else {
+ Res = cuMemsetD2D8Async((CUdeviceptr)TgtPtr + Step * MemsetSize,
+ Pitch, *((const uint8_t *)PatternPtr + Step),
+ 1u, Height, Stream);
+ }
+ }
+ }
+
+ return Plugin::check(Res, "error in cuMemset: %s");
+ }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
if (auto Err = setContext())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 25443fd1ac0b3..e830f2ccabc62 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -302,6 +302,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}
  /// Memory fill is not implemented for the host (GenELF64) plugin; report
  /// UNSUPPORTED so liboffload callers get a well-defined error rather than
  /// a silent no-op.
  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                     int64_t Size,
                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    return Plugin::error(ErrorCode::UNSUPPORTED, "dataFillImpl not supported");
  }
+
/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo,
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index b25db7022e9d7..58c9b89d1ed0d 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -24,6 +24,7 @@ add_offload_unittest("kernel"
add_offload_unittest("memory"
memory/olMemAlloc.cpp
+ memory/olMemFill.cpp
memory/olMemFree.cpp
memory/olMemcpy.cpp)
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
new file mode 100644
index 0000000000000..1b0bafa202080
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,134 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemFillTest = OffloadQueueTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint8_t Pattern = 0x42;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success16) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint16_t Pattern = 0x4242;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success32) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint32_t Pattern = 0xDEADBEEF;
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
+ ASSERT_EQ(AllocPtr[i], Pattern);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLarge) {
+ constexpr size_t Size = 1024;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ struct PatternT {
+ uint64_t A;
+ uint64_t B;
+ } Pattern{UINT64_MAX, UINT64_MAX};
+
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+ constexpr size_t Size = 17 * 64;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ struct __attribute__((packed)) PatternT {
+ uint64_t A;
+ uint64_t B;
+ uint8_t C;
+ } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+ ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+
+ size_t N = Size / sizeof(Pattern);
+ for (size_t i = 0; i < N; i++) {
+ PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+ ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+ ASSERT_EQ(AllocPtr[i].C, 255);
+ }
+
+ olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+ constexpr size_t Size = 1025;
+ void *Alloc;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+ uint16_t Pattern = 0x4242;
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+ olSyncQueue(Queue);
+ olMemFree(Alloc);
+}
More information about the llvm-commits
mailing list