[clang] [llvm] [Offload] Provide a kernel library useable by the offload runtime (PR #104168)
Johannes Doerfert via cfe-commits
cfe-commits at lists.llvm.org
Wed Aug 14 16:44:16 PDT 2024
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/104168
>From 73daf3f9968f07e7f52b6e2ed956773d873634f3 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 12 Aug 2024 11:53:06 -0700
Subject: [PATCH] [Offload] Provide a kernel library useable by the offload
runtime
As mentioned in #68706, it is useful to be able to call kernels from the
runtime, e.g., to perform memset. This patch provides a kernel library
that can be invoked from the offload runtime directly.
---
clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 +-
offload/DeviceRTL/CMakeLists.txt | 1 +
offload/include/device.h | 3 +
offload/include/omptarget.h | 5 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 56 ++-------
.../common/include/PluginInterface.h | 33 ++++--
.../common/src/PluginInterface.cpp | 109 ++++++++++++++++--
offload/plugins-nextgen/cuda/src/rtl.cpp | 33 +++---
offload/plugins-nextgen/host/src/rtl.cpp | 5 +-
offload/src/CMakeLists.txt | 18 +++
offload/src/Kernels/Memory.cpp | 53 +++++++++
offload/src/OpenMP/API.cpp | 65 ++++++++---
offload/src/device.cpp | 11 ++
offload/src/exports | 1 +
offload/src/interface.cpp | 15 +++
offload/test/jit/type_punning.c | 4 +-
offload/test/lit.cfg | 5 +-
offload/test/offloading/kernels_memset.c | 61 ++++++++++
18 files changed, 370 insertions(+), 115 deletions(-)
create mode 100644 offload/src/Kernels/Memory.cpp
create mode 100644 offload/test/offloading/kernels_memset.c
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1cba3e1220264a..4080356c636dc2 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
options::OPT_fno_openmp, false)) {
// We need libomptarget (liboffload) if it's the choosen offloading runtime.
if (Args.hasFlag(options::OPT_foffload_via_llvm,
- options::OPT_fno_offload_via_llvm, false))
+ options::OPT_fno_offload_via_llvm, false)) {
CmdArgs.push_back("-lomptarget");
+ if (!Args.hasArg(options::OPT_nogpulib))
+ CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
+ }
return false;
}
@@ -1240,7 +1243,7 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
CmdArgs.push_back("-lomptarget");
if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
- CmdArgs.push_back("-lomptarget.devicertl");
+ CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
addArchSpecificRPath(TC, Args, CmdArgs);
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 7818c8d752599c..e321047f781a3e 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
"${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
endif()
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
+set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE)
set(include_files
${include_directory}/Allocator.h
diff --git a/offload/include/device.h b/offload/include/device.h
index 3132d35b7b38c8..d3415785708d62 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -124,6 +124,9 @@ struct DeviceTy {
/// Calls the corresponding print device info function in the plugin.
bool printDeviceInfo();
+ /// Return the handle to the kernel with name \p Name in \p HandlePtr.
+ int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr);
+
/// Event related interfaces.
/// {
/// Create an event.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 2b6445e9fbe550..f4ff5d33f7bf0f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper(
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
+/// Launch the kernel \p KernelName with a CUDA style launch and the given grid
+/// sizes and arguments (\p KernelArgs).
+int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName,
+ KernelArgsTy *KernelArgs);
+
// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from task data of the current task being
// executed and tries to query for the completion of its operations. If the
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 604683370cd27d..5d135795170563 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2016,20 +2016,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
- virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) override {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
- if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
- Image.setPendingGlobalDtors();
-
- return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
+ virtual Expected<StringRef>
+ getGlobalConstructorName(DeviceImageTy &Image) override {
+ return "amdgcn.device.init";
}
-
- virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) override {
- if (Image.hasPendingGlobalDtors())
- return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
- return Plugin::success();
+ virtual Expected<StringRef>
+ getGlobalDestructorName(DeviceImageTy &Image) override {
+ return "amdgcn.device.fini";
}
uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
@@ -2107,13 +2100,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
uint64_t getClockFrequency() const override { return ClockFrequency; }
/// Allocate and construct an AMDGPU kernel.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
if (!AMDGPUKernel)
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
- new (AMDGPUKernel) AMDGPUKernelTy(Name);
+ new (AMDGPUKernel) AMDGPUKernelTy(Name.data());
return *AMDGPUKernel;
}
@@ -2791,38 +2785,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
- /// Common method to invoke a single threaded constructor or destructor
- /// kernel by name.
- Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
- bool IsCtor) {
- const char *KernelName =
- IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
- // Perform a quick check for the named kernel in the image. The kernel
- // should be created by the 'amdgpu-lower-ctor-dtor' pass.
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
- if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
- return Plugin::success();
-
- // Allocate and construct the AMDGPU kernel.
- AMDGPUKernelTy AMDGPUKernel(KernelName);
- if (auto Err = AMDGPUKernel.init(*this, Image))
- return Err;
-
- AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
-
- KernelArgsTy KernelArgs = {};
- if (auto Err =
- AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
- /*NumBlocks=*/1ul, KernelArgs,
- KernelLaunchParamsTy{}, AsyncInfoWrapper))
- return Err;
-
- Error Err = Plugin::success();
- AsyncInfoWrapper.finalize(Err);
-
- return Err;
- }
-
/// Detect if current architecture is an APU.
Error checkIfAPU() {
// TODO: replace with ROCr API once it becomes available.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 81823338fe2112..08bdb0d75a464f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -722,18 +722,17 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error synchronize(__tgt_async_info *AsyncInfo);
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
- /// Invokes any global constructors on the device if present and is required
- /// by the target.
- virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) {
- return Error::success();
+ /// Call the ctor/dtor of image \p Image, if available.
+ Error callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor);
+
+ /// Return the name of the global constructors on the device.
+ virtual Expected<StringRef> getGlobalConstructorName(DeviceImageTy &Image) {
+ return "";
}
- /// Invokes any global destructors on the device if present and is required
- /// by the target.
- virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) {
- return Error::success();
+ /// Return the name of the global destructors on the device.
+ virtual Expected<StringRef> getGlobalDestructorName(DeviceImageTy &Image) {
+ return "";
}
/// Query for the completion of the pending operations on the __tgt_async_info
@@ -928,8 +927,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
bool useAutoZeroCopy();
virtual bool useAutoZeroCopyImpl() { return false; }
- /// Allocate and construct a kernel object.
- virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
+ /// Retrieve the kernel with name \p Name from image \p Image (or any image if
+ /// \p Image is null) and return it. If \p Optional is true, the function
+ /// returns success if there is no kernel with the given name.
+ Expected<GenericKernelTy *> getKernel(llvm::StringRef Name,
+ DeviceImageTy *Image = nullptr,
+ bool Optional = false);
/// Reference to the underlying plugin that created this device.
GenericPluginTy &Plugin;
@@ -947,6 +950,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
private:
+ /// Allocate and construct a kernel object (users should use getKernel).
+ virtual Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) = 0;
+
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
@@ -1046,6 +1053,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+
+ DenseMap<StringRef, GenericKernelTy *> KernelMap;
};
/// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 84d946507ea74a..dfc5687b82ee1b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -36,6 +36,7 @@
#include <cstdint>
#include <limits>
+#include <string>
using namespace llvm;
using namespace omp;
@@ -809,7 +810,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
for (DeviceImageTy *Image : LoadedImages)
- if (auto Err = callGlobalDestructors(Plugin, *Image))
+ if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ false))
return Err;
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
@@ -866,6 +867,37 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
return deinitImpl();
}
+
+Error GenericDeviceTy::callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor) {
+ auto NameOrErr =
+ IsCtor ? getGlobalConstructorName(Image) : getGlobalDestructorName(Image);
+ if (auto Err = NameOrErr.takeError())
+ return Err;
+ // No error but no name, that means there is no ctor/dtor.
+ if (NameOrErr->empty())
+ return Plugin::success();
+
+ auto KernelOrErr = getKernel(*NameOrErr, &Image, /*Optional=*/true);
+ if (auto Err = KernelOrErr.takeError())
+ return Err;
+
+ if (GenericKernelTy *Kernel = *KernelOrErr) {
+ KernelArgsTy KernelArgs;
+ KernelArgs.NumTeams[0] = KernelArgs.ThreadLimit[0] = 1;
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
+ if (auto Err = Kernel->launch(*this, /*ArgPtrs=*/nullptr,
+ /*ArgOffsets=*/nullptr, KernelArgs,
+ AsyncInfoWrapper))
+ return Err;
+
+ Error Err = Plugin::success();
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+ }
+
+ return Plugin::success();
+}
+
Expected<DeviceImageTy *>
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
const __tgt_device_image *InputTgtImage) {
@@ -927,8 +959,8 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
#endif
// Call any global constructors present on the device.
- if (auto Err = callGlobalConstructors(Plugin, *Image))
- return std::move(Err);
+ if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ true))
+ return Err;
// Return the pointer to the table of entries.
return Image;
@@ -1533,6 +1565,67 @@ Error GenericDeviceTy::printInfo() {
return Plugin::success();
}
+Expected<GenericKernelTy *> GenericDeviceTy::getKernel(llvm::StringRef Name,
+ DeviceImageTy *ImagePtr,
+ bool Optional) {
+ bool KernelFound = false;
+ GenericKernelTy *&KernelPtr = KernelMap[Name];
+ if (!KernelPtr) {
+ GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+
+ auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * {
+ if (!GHandler.isSymbolInImage(*this, Image, Name))
+ return nullptr;
+ KernelFound = true;
+
+ auto KernelOrErr = constructKernelImpl(Name);
+ if (Error Err = KernelOrErr.takeError()) {
+ [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+ DP("Failed to construct kernel ('%s'): %s", Name.data(),
+ ErrStr.c_str());
+ return nullptr;
+ }
+
+ GenericKernelTy &Kernel = *KernelOrErr;
+ if (auto Err = Kernel.init(*this, Image)) {
+ [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+ DP("Failed to initialize kernel ('%s'): %s", Name.data(),
+ ErrStr.c_str());
+ return nullptr;
+ }
+
+ return &Kernel;
+ };
+
+ if (ImagePtr) {
+ KernelPtr = CheckImage(*ImagePtr);
+ } else {
+ for (DeviceImageTy *Image : LoadedImages) {
+ KernelPtr = CheckImage(*Image);
+ if (KernelPtr)
+ break;
+ }
+ }
+ }
+
+ // If we didn't find the kernel and it was optional, we do not emit an error.
+ if (!KernelPtr && !KernelFound && Optional)
+ return nullptr;
+ // If we didn't find the kernel and it was not optional, we will emit an
+ // error.
+ if (!KernelPtr && !KernelFound)
+ return Plugin::error(
+ "Kernel '%s' not found%s", Name.data(),
+ ImagePtr
+ ? ""
+ : ", searched " + std::to_string(LoadedImages.size()) + " images");
+ // If we found the kernel but couldn't initialize it, we will emit an error.
+ if (!KernelPtr)
+ return Plugin::error("Kernel '%s' failed to initialize");
+ // Found the kernel and initialized it.
+ return KernelPtr;
+}
+
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
return createEventImpl(EventPtrStorage);
}
@@ -2147,20 +2240,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
GenericDeviceTy &Device = Image.getDevice();
- auto KernelOrErr = Device.constructKernel(Name);
+ auto KernelOrErr = Device.getKernel(Name, &Image);
if (Error Err = KernelOrErr.takeError()) {
REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
- GenericKernelTy &Kernel = *KernelOrErr;
- if (auto Err = Kernel.init(Device, Image)) {
- REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
- return OFFLOAD_FAIL;
- }
-
// Note that this is not the kernel's device address.
- *KernelPtr = &Kernel;
+ *KernelPtr = *KernelOrErr;
return OFFLOAD_SUCCESS;
}
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033f..2a7b6f844cca05 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -393,22 +393,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
- virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) override {
- // Check for the presense of global destructors at initialization time. This
- // is required when the image may be deallocated before destructors are run.
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
- if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
- Image.setPendingGlobalDtors();
-
- return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
+ virtual Expected<StringRef>
+ getGlobalConstructorName(DeviceImageTy &Image) override {
+ if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/true))
+ return Err;
+ return "nvptx$device$init";
}
-
- virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
- DeviceImageTy &Image) override {
- if (Image.hasPendingGlobalDtors())
- return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
- return Plugin::success();
+ virtual Expected<StringRef>
+ getGlobalDestructorName(DeviceImageTy &Image) override {
+ if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/false))
+ return Err;
+ return "nvptx$device$fini";
}
Expected<std::unique_ptr<MemoryBuffer>>
@@ -471,13 +466,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Allocate and construct a CUDA kernel.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the CUDA kernel.
CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
if (!CUDAKernel)
return Plugin::error("Failed to allocate memory for CUDA kernel");
- new (CUDAKernel) CUDAKernelTy(Name);
+ new (CUDAKernel) CUDAKernelTy(Name.data());
return *CUDAKernel;
}
@@ -1149,8 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
- Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
- bool IsCtor) {
+ Error prepareGlobalCtorDtorCommon(DeviceImageTy &Image, bool IsCtor) {
const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
// Perform a quick check for the named kernel in the image. The kernel
// should be created by the 'nvptx-lower-ctor-dtor' pass.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index fe296b77c7d557..604b2648b6d629 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -151,13 +151,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
std::string getComputeUnitKind() const override { return "generic-64bit"; }
/// Construct the kernel for a specific image on the device.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the kernel.
GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
if (!GenELF64Kernel)
return Plugin::error("Failed to allocate memory for GenELF64 kernel");
- new (GenELF64Kernel) GenELF64KernelTy(Name);
+ new (GenELF64Kernel) GenELF64KernelTy(Name.data());
return *GenELF64Kernel;
}
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index c5f5d902fad14c..75c3ea68eed107 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -62,6 +62,23 @@ endforeach()
target_compile_options(omptarget PRIVATE ${offload_compile_flags})
target_link_options(omptarget PRIVATE ${offload_link_flags})
+add_llvm_library(offload.kernels
+ STATIC
+
+ Kernels/Memory.cpp
+
+ LINK_LIBS
+ PUBLIC
+ omptarget.devicertl
+
+ NO_INSTALL_RPATH
+ BUILDTREE_ONLY
+)
+
+list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+
# libomptarget.so needs to be aware of where the plugins live as they
# are now separated in the build directory.
set_target_properties(omptarget PROPERTIES
@@ -69,3 +86,4 @@ set_target_properties(omptarget PROPERTIES
INSTALL_RPATH "$ORIGIN"
BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/src/Kernels/Memory.cpp b/offload/src/Kernels/Memory.cpp
new file mode 100644
index 00000000000000..94777872106b05
--- /dev/null
+++ b/offload/src/Kernels/Memory.cpp
@@ -0,0 +1,53 @@
+//===-- Kenrels/Memory.cpp - Memory related kernel definitions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#define LAUNCH_BOUNDS(MIN, MAX) \
+ __attribute__((launch_bounds(MAX), amdgpu_flat_work_group_size(MIN, MAX)))
+#define INLINE [[clang::always_inline]] inline
+#define KERNEL [[gnu::weak]] __global__
+#define DEVICE __device__
+
+extern "C" {
+DEVICE int ompx_thread_id(int Dim);
+DEVICE int ompx_block_id(int Dim);
+DEVICE int ompx_block_dim(int Dim);
+DEVICE int ompx_grid_dim(int Dim);
+}
+
+namespace {
+INLINE
+DEVICE void __memset_impl(char *Ptr, int ByteVal, size_t NumBytes) {
+ int TId = ompx_thread_id(0);
+ int BId = ompx_block_id(0);
+ int BDim = ompx_block_dim(0);
+ size_t GId = BId * BDim + TId;
+ if (GId < NumBytes)
+ Ptr[GId] = ByteVal;
+}
+} // namespace
+
+extern "C" {
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, ByteVal, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset_zero(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, 0, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset_ones(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, ~0, NumBytes);
+}
+}
diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp
index e59bdba8abf0e4..210cadab25edee 100644
--- a/offload/src/OpenMP/API.cpp
+++ b/offload/src/OpenMP/API.cpp
@@ -392,25 +392,52 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
DP("filling memory on host via memset");
memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
} else {
- // TODO: replace the omp_target_memset() slow path with the fast path.
- // That will require the ability to execute a kernel from within
- // libomptarget.so (which we do not have at the moment).
-
- // This is a very slow path: create a filled array on the host and upload
- // it to the GPU device.
- int InitialDevice = omp_get_initial_device();
- void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
- if (Shadow) {
- (void)memset(Shadow, ByteVal, NumBytes);
- (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
- InitialDevice);
- (void)omp_target_free(Shadow, InitialDevice);
- } else {
- // If the omp_target_alloc has failed, let's just not do anything.
- // omp_target_memset does not have any good way to fail, so we
- // simply avoid a catastrophic failure of the process for now.
- DP("omp_target_memset failed to fill memory due to error with "
- "omp_target_alloc");
+ struct LaunchArgsTy {
+ void *Ptr;
+ int ByteVal;
+ size_t NumBytes;
+ } LaunchArgs{Ptr, ByteVal, NumBytes};
+
+ auto NumThreads = NumBytes > 256 ? 256 : NumBytes;
+ auto NumBlocks = (NumBytes + 255) / 256;
+ const char *KernelName = "__memset";
+ switch (ByteVal) {
+ case 0:
+ KernelName = "__memset_zero";
+ break;
+ case ~0:
+ KernelName = "__memset_ones";
+ break;
+ default:
+ break;
+ };
+ // Try to launch the __memset kernel first.
+ KernelArgsTy KernelArgs;
+ KernelArgs.NumTeams[0] = NumBlocks;
+ KernelArgs.ThreadLimit[0] = NumThreads;
+ struct {
+ size_t LaunchArgsSize;
+ void *LaunchArgs;
+ } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs};
+ KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs);
+ KernelArgs.Flags.IsCUDA = true;
+ if (__tgt_launch_by_name(nullptr, DeviceNum, KernelName, &KernelArgs)) {
+ // This is a very slow path: create a filled array on the host and upload
+ // it to the GPU device.
+ int InitialDevice = omp_get_initial_device();
+ void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+ if (Shadow) {
+ (void)memset(Shadow, ByteVal, NumBytes);
+ (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+ InitialDevice);
+ (void)omp_target_free(Shadow, InitialDevice);
+ } else {
+ // If the omp_target_alloc has failed, let's just not do anything.
+ // omp_target_memset does not have any good way to fail, so we
+ // simply avoid a catastrophic failure of the process for now.
+ DP("omp_target_memset failed to fill memory due to error with "
+ "omp_target_alloc");
+ }
}
}
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 943c7782787306..9b782009d08cd0 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -226,6 +226,17 @@ bool DeviceTy::printDeviceInfo() {
return true;
}
+int32_t DeviceTy::getKernelHandle(llvm::StringRef Name, void **HandlePtr) {
+ auto KernelOrErr = RTL->getDevice(RTLDeviceID).getKernel(Name);
+ if (!KernelOrErr) {
+ [[maybe_unused]] auto ErrStr = toString(KernelOrErr.takeError());
+ DP("%s\n", ErrStr.c_str());
+ return OFFLOAD_FAIL;
+ }
+ *HandlePtr = *KernelOrErr;
+ return OFFLOAD_SUCCESS;
+}
+
// Whether data can be copied to DstDevice directly
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
if (RTL != DstDevice.RTL)
diff --git a/offload/src/exports b/offload/src/exports
index 7bdc7d2a531bb3..b7671dd1421bd6 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -27,6 +27,7 @@ VERS1.0 {
__tgt_target_nowait_mapper;
__tgt_target_teams_nowait_mapper;
__tgt_target_kernel;
+ __tgt_launch_by_name;
__tgt_target_kernel_nowait;
__tgt_target_nowait_query;
__tgt_target_kernel_replay;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 21f9114ac2b088..dad643187fba26 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -394,6 +394,21 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
HostPtr, KernelArgs);
}
+EXTERN int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId,
+ const char *KernelName,
+ KernelArgsTy *KernelArgs) {
+ auto DeviceOrErr = PM->getDevice(DeviceId);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+ auto &Device = *DeviceOrErr;
+ void *Handle;
+ if (Device.getKernelHandle(KernelName, &Handle))
+ return OFFLOAD_FAIL;
+ AsyncInfoTy AsyncInfo(*DeviceOrErr);
+ return DeviceOrErr->launchKernel(Handle, nullptr, nullptr, *KernelArgs,
+ AsyncInfo);
+}
+
/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
diff --git a/offload/test/jit/type_punning.c b/offload/test/jit/type_punning.c
index 574168b8a69cbb..c2cd415a5fc75f 100644
--- a/offload/test/jit/type_punning.c
+++ b/offload/test/jit/type_punning.c
@@ -13,8 +13,8 @@
// Ensure that there is only the kernel function left, not any outlined
// parallel regions.
//
-// CHECK: define
-// CHECK-NOT: define
+// CHECK: define {{.*}}__omp_offloading_
+// CHECK-NOT: call {{.*}}@__
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index b4fc7d3b333b35..907300096f1665 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -179,7 +179,10 @@ def remove_suffix_if_present(name):
return name
def add_libraries(source):
- return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+ source += " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+ source += " " + config.llvm_library_intdir + "/liboffload.kernels.a"
+ return source
+
# Add platform targets
host_targets = [
diff --git a/offload/test/offloading/kernels_memset.c b/offload/test/offloading/kernels_memset.c
new file mode 100644
index 00000000000000..4cdbd56c366a88
--- /dev/null
+++ b/offload/test/offloading/kernels_memset.c
@@ -0,0 +1,61 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN: %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 64;
+ const int block_size = 256;
+ const int N = num_blocks * block_size;
+ int *data =
+ (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device());
+
+ // clang-format off
+ // CHECK: Launching kernel __memset_zero with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, '\0', N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (data[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != 0)
+ __builtin_trap();
+ }
+
+ // clang-format off
+ // CHECK: Launching kernel __memset_ones with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, ~0, N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (data[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != ~0)
+ __builtin_trap();
+ }
+
+ // clang-format off
+ // CHECK: Launching kernel __memset with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 256 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, '$', N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+ char *cdata = (char *)data;
+#pragma omp target teams num_teams(num_blocks * sizeof(int)) \
+ thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (cdata[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != '$')
+ __builtin_trap();
+ }
+
+ return 0;
+}
More information about the cfe-commits
mailing list