[clang] [llvm] [Offload] Provide a kernel library useable by the offload runtime (PR #104168)
Johannes Doerfert via cfe-commits
cfe-commits at lists.llvm.org
Wed Aug 14 14:38:36 PDT 2024
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/104168
>From 9e529ede3a97513ec0afee6c4538e7299d2933f0 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 12 Aug 2024 11:53:06 -0700
Subject: [PATCH] [Offload] Provide a kernel library useable by the offload
runtime
As mentioned in #68706, it is useful to be able to call kernels from the
runtime, e.g., to perform memset. This patch provides a kernel library
that can be invoked from the offload runtime directly.
---
clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 +-
offload/DeviceRTL/CMakeLists.txt | 1 +
offload/include/device.h | 3 +
offload/include/omptarget.h | 5 ++
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 5 +-
.../common/include/PluginInterface.h | 12 +++-
.../common/src/PluginInterface.cpp | 59 ++++++++++++++---
offload/plugins-nextgen/cuda/src/rtl.cpp | 5 +-
offload/plugins-nextgen/host/src/rtl.cpp | 5 +-
offload/src/CMakeLists.txt | 18 +++++
offload/src/Kernels/Memory.cpp | 53 +++++++++++++++
offload/src/OpenMP/API.cpp | 65 +++++++++++++------
offload/src/device.cpp | 11 ++++
offload/src/exports | 1 +
offload/src/interface.cpp | 15 +++++
offload/test/jit/type_punning.c | 4 +-
offload/test/lit.cfg | 5 +-
offload/test/offloading/kernels_memset.c | 61 +++++++++++++++++
18 files changed, 295 insertions(+), 40 deletions(-)
create mode 100644 offload/src/Kernels/Memory.cpp
create mode 100644 offload/test/offloading/kernels_memset.c
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1cba3e1220264a..4080356c636dc2 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
options::OPT_fno_openmp, false)) {
// We need libomptarget (liboffload) if it's the choosen offloading runtime.
if (Args.hasFlag(options::OPT_foffload_via_llvm,
- options::OPT_fno_offload_via_llvm, false))
+ options::OPT_fno_offload_via_llvm, false)) {
CmdArgs.push_back("-lomptarget");
+ if (!Args.hasArg(options::OPT_nogpulib))
+ CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
+ }
return false;
}
@@ -1240,7 +1243,7 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
CmdArgs.push_back("-lomptarget");
if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
- CmdArgs.push_back("-lomptarget.devicertl");
+ CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
addArchSpecificRPath(TC, Args, CmdArgs);
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 7818c8d752599c..e321047f781a3e 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
"${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
endif()
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
+set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE)
set(include_files
${include_directory}/Allocator.h
diff --git a/offload/include/device.h b/offload/include/device.h
index 3132d35b7b38c8..d3415785708d62 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -124,6 +124,9 @@ struct DeviceTy {
/// Calls the corresponding print device info function in the plugin.
bool printDeviceInfo();
+ /// Return the handle to the kernel with name \p Name in \p HandlePtr.
+ int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr);
+
/// Event related interfaces.
/// {
/// Create an event.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 2b6445e9fbe550..f4ff5d33f7bf0f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper(
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
+/// Launch the kernel \p KernelName with a CUDA style launch and the given grid
+/// sizes and arguments (\p KernelArgs).
+int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName,
+ KernelArgsTy *KernelArgs);
+
// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from task data of the current task being
// executed and tries to query for the completion of its operations. If the
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 604683370cd27d..5397408f21fabb 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2107,13 +2107,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
uint64_t getClockFrequency() const override { return ClockFrequency; }
/// Allocate and construct an AMDGPU kernel.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
if (!AMDGPUKernel)
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
- new (AMDGPUKernel) AMDGPUKernelTy(Name);
+ new (AMDGPUKernel) AMDGPUKernelTy(Name.data());
return *AMDGPUKernel;
}
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 81823338fe2112..4ace1c9972a73c 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -928,8 +928,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
bool useAutoZeroCopy();
virtual bool useAutoZeroCopyImpl() { return false; }
- /// Allocate and construct a kernel object.
- virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
+ /// Retrieve the kernel with name \p Name from image \p Image (or any image if
+ /// \p Image is null) and return it.
+ Expected<GenericKernelTy &> getKernel(llvm::StringRef Name,
+ DeviceImageTy *Image = nullptr);
/// Reference to the underlying plugin that created this device.
GenericPluginTy &Plugin;
@@ -947,6 +949,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
private:
+ /// Allocate and construct a kernel object (users should use getKernel).
+ virtual Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) = 0;
+
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
@@ -1046,6 +1052,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+
+ DenseMap<StringRef, GenericKernelTy *> KernelMap;
};
/// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 84d946507ea74a..3599ae61c7ed0d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1533,6 +1533,55 @@ Error GenericDeviceTy::printInfo() {
return Plugin::success();
}
+Expected<GenericKernelTy &>
+GenericDeviceTy::getKernel(llvm::StringRef Name, DeviceImageTy *ImagePtr) {
+
+ GenericKernelTy *&KernelPtr = KernelMap[Name];
+ if (!KernelPtr) {
+ GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+
+ auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * {
+ if (!GHandler.isSymbolInImage(*this, Image, Name))
+ return nullptr;
+
+ auto KernelOrErr = constructKernelImpl(Name);
+ if (Error Err = KernelOrErr.takeError()) {
+ [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+ DP("Failed to construct kernel ('%s'): %s", Name.data(),
+ ErrStr.c_str());
+ return nullptr;
+ }
+
+ GenericKernelTy &Kernel = *KernelOrErr;
+ if (auto Err = Kernel.init(*this, Image)) {
+ [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+ DP("Failed to initialize kernel ('%s'): %s", Name.data(),
+ ErrStr.c_str());
+ return nullptr;
+ }
+
+ return &Kernel;
+ };
+
+ if (ImagePtr) {
+ KernelPtr = CheckImage(*ImagePtr);
+ } else {
+ for (DeviceImageTy *Image : LoadedImages) {
+ KernelPtr = CheckImage(*Image);
+ if (KernelPtr)
+ break;
+ }
+ }
+ }
+
+ if (!KernelPtr)
+ return Plugin::error("Kernel '%s' not found or could not be initialized, "
+ "searched %zu images",
+ Name.data(),
+ ImagePtr ? size_t(1) : LoadedImages.size());
+ return *KernelPtr;
+}
+
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
return createEventImpl(EventPtrStorage);
}
@@ -2147,20 +2196,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
GenericDeviceTy &Device = Image.getDevice();
- auto KernelOrErr = Device.constructKernel(Name);
+ auto KernelOrErr = Device.getKernel(Name, &Image);
if (Error Err = KernelOrErr.takeError()) {
REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
- GenericKernelTy &Kernel = *KernelOrErr;
- if (auto Err = Kernel.init(Device, Image)) {
- REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
- return OFFLOAD_FAIL;
- }
-
// Note that this is not the kernel's device address.
- *KernelPtr = &Kernel;
+ *KernelPtr = &*KernelOrErr;
return OFFLOAD_SUCCESS;
}
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033f..418654aa5690e6 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -471,13 +471,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Allocate and construct a CUDA kernel.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the CUDA kernel.
CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
if (!CUDAKernel)
return Plugin::error("Failed to allocate memory for CUDA kernel");
- new (CUDAKernel) CUDAKernelTy(Name);
+ new (CUDAKernel) CUDAKernelTy(Name.data());
return *CUDAKernel;
}
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index fe296b77c7d557..604b2648b6d629 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -151,13 +151,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
std::string getComputeUnitKind() const override { return "generic-64bit"; }
/// Construct the kernel for a specific image on the device.
- Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+ Expected<GenericKernelTy &>
+ constructKernelImpl(llvm::StringRef Name) override {
// Allocate and construct the kernel.
GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
if (!GenELF64Kernel)
return Plugin::error("Failed to allocate memory for GenELF64 kernel");
- new (GenELF64Kernel) GenELF64KernelTy(Name);
+ new (GenELF64Kernel) GenELF64KernelTy(Name.data());
return *GenELF64Kernel;
}
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index c5f5d902fad14c..75c3ea68eed107 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -62,6 +62,23 @@ endforeach()
target_compile_options(omptarget PRIVATE ${offload_compile_flags})
target_link_options(omptarget PRIVATE ${offload_link_flags})
+add_llvm_library(offload.kernels
+ STATIC
+
+ Kernels/Memory.cpp
+
+ LINK_LIBS
+ PUBLIC
+ omptarget.devicertl
+
+ NO_INSTALL_RPATH
+ BUILDTREE_ONLY
+)
+
+list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+
# libomptarget.so needs to be aware of where the plugins live as they
# are now separated in the build directory.
set_target_properties(omptarget PROPERTIES
@@ -69,3 +86,4 @@ set_target_properties(omptarget PROPERTIES
INSTALL_RPATH "$ORIGIN"
BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/src/Kernels/Memory.cpp b/offload/src/Kernels/Memory.cpp
new file mode 100644
index 00000000000000..94777872106b05
--- /dev/null
+++ b/offload/src/Kernels/Memory.cpp
@@ -0,0 +1,53 @@
+//===-- Kenrels/Memory.cpp - Memory related kernel definitions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#define LAUNCH_BOUNDS(MIN, MAX) \
+ __attribute__((launch_bounds(MAX), amdgpu_flat_work_group_size(MIN, MAX)))
+#define INLINE [[clang::always_inline]] inline
+#define KERNEL [[gnu::weak]] __global__
+#define DEVICE __device__
+
+extern "C" {
+DEVICE int ompx_thread_id(int Dim);
+DEVICE int ompx_block_id(int Dim);
+DEVICE int ompx_block_dim(int Dim);
+DEVICE int ompx_grid_dim(int Dim);
+}
+
+namespace {
+INLINE
+DEVICE void __memset_impl(char *Ptr, int ByteVal, size_t NumBytes) {
+ int TId = ompx_thread_id(0);
+ int BId = ompx_block_id(0);
+ int BDim = ompx_block_dim(0);
+ size_t GId = BId * BDim + TId;
+ if (GId < NumBytes)
+ Ptr[GId] = ByteVal;
+}
+} // namespace
+
+extern "C" {
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, ByteVal, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset_zero(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, 0, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+ __memset_ones(char *Ptr, int ByteVal, size_t NumBytes) {
+ __memset_impl(Ptr, ~0, NumBytes);
+}
+}
diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp
index e59bdba8abf0e4..210cadab25edee 100644
--- a/offload/src/OpenMP/API.cpp
+++ b/offload/src/OpenMP/API.cpp
@@ -392,25 +392,52 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
DP("filling memory on host via memset");
memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
} else {
- // TODO: replace the omp_target_memset() slow path with the fast path.
- // That will require the ability to execute a kernel from within
- // libomptarget.so (which we do not have at the moment).
-
- // This is a very slow path: create a filled array on the host and upload
- // it to the GPU device.
- int InitialDevice = omp_get_initial_device();
- void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
- if (Shadow) {
- (void)memset(Shadow, ByteVal, NumBytes);
- (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
- InitialDevice);
- (void)omp_target_free(Shadow, InitialDevice);
- } else {
- // If the omp_target_alloc has failed, let's just not do anything.
- // omp_target_memset does not have any good way to fail, so we
- // simply avoid a catastrophic failure of the process for now.
- DP("omp_target_memset failed to fill memory due to error with "
- "omp_target_alloc");
+ struct LaunchArgsTy {
+ void *Ptr;
+ int ByteVal;
+ size_t NumBytes;
+ } LaunchArgs{Ptr, ByteVal, NumBytes};
+
+ auto NumThreads = NumBytes > 256 ? 256 : NumBytes;
+ auto NumBlocks = (NumBytes + 255) / 256;
+ const char *KernelName = "__memset";
+ switch (ByteVal) {
+ case 0:
+ KernelName = "__memset_zero";
+ break;
+ case ~0:
+ KernelName = "__memset_ones";
+ break;
+ default:
+ break;
+ };
+ // Try to launch the __memset kernel first.
+ KernelArgsTy KernelArgs;
+ KernelArgs.NumTeams[0] = NumBlocks;
+ KernelArgs.ThreadLimit[0] = NumThreads;
+ struct {
+ size_t LaunchArgsSize;
+ void *LaunchArgs;
+ } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs};
+ KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs);
+ KernelArgs.Flags.IsCUDA = true;
+ if (__tgt_launch_by_name(nullptr, DeviceNum, KernelName, &KernelArgs)) {
+ // This is a very slow path: create a filled array on the host and upload
+ // it to the GPU device.
+ int InitialDevice = omp_get_initial_device();
+ void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+ if (Shadow) {
+ (void)memset(Shadow, ByteVal, NumBytes);
+ (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+ InitialDevice);
+ (void)omp_target_free(Shadow, InitialDevice);
+ } else {
+ // If the omp_target_alloc has failed, let's just not do anything.
+ // omp_target_memset does not have any good way to fail, so we
+ // simply avoid a catastrophic failure of the process for now.
+ DP("omp_target_memset failed to fill memory due to error with "
+ "omp_target_alloc");
+ }
}
}
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 943c7782787306..84660fbedaf547 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -226,6 +226,17 @@ bool DeviceTy::printDeviceInfo() {
return true;
}
+int32_t DeviceTy::getKernelHandle(llvm::StringRef Name, void **HandlePtr) {
+ auto KernelOrErr = RTL->getDevice(RTLDeviceID).getKernel(Name);
+ if (!KernelOrErr) {
+ [[maybe_unused]] auto ErrStr = toString(KernelOrErr.takeError());
+ DP("%s\n", ErrStr.c_str());
+ return OFFLOAD_FAIL;
+ }
+ *HandlePtr = &*KernelOrErr;
+ return OFFLOAD_SUCCESS;
+}
+
// Whether data can be copied to DstDevice directly
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
if (RTL != DstDevice.RTL)
diff --git a/offload/src/exports b/offload/src/exports
index 7bdc7d2a531bb3..b7671dd1421bd6 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -27,6 +27,7 @@ VERS1.0 {
__tgt_target_nowait_mapper;
__tgt_target_teams_nowait_mapper;
__tgt_target_kernel;
+ __tgt_launch_by_name;
__tgt_target_kernel_nowait;
__tgt_target_nowait_query;
__tgt_target_kernel_replay;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 21f9114ac2b088..dad643187fba26 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -394,6 +394,21 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
HostPtr, KernelArgs);
}
+EXTERN int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId,
+ const char *KernelName,
+ KernelArgsTy *KernelArgs) {
+ auto DeviceOrErr = PM->getDevice(DeviceId);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+ auto &Device = *DeviceOrErr;
+ void *Handle;
+ if (Device.getKernelHandle(KernelName, &Handle))
+ return OFFLOAD_FAIL;
+ AsyncInfoTy AsyncInfo(*DeviceOrErr);
+ return DeviceOrErr->launchKernel(Handle, nullptr, nullptr, *KernelArgs,
+ AsyncInfo);
+}
+
/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
diff --git a/offload/test/jit/type_punning.c b/offload/test/jit/type_punning.c
index 574168b8a69cbb..c2cd415a5fc75f 100644
--- a/offload/test/jit/type_punning.c
+++ b/offload/test/jit/type_punning.c
@@ -13,8 +13,8 @@
// Ensure that there is only the kernel function left, not any outlined
// parallel regions.
//
-// CHECK: define
-// CHECK-NOT: define
+// CHECK: define {{.*}}__omp_offloading_
+// CHECK-NOT: call {{.*}}@__
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index b4fc7d3b333b35..907300096f1665 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -179,7 +179,10 @@ def remove_suffix_if_present(name):
return name
def add_libraries(source):
- return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+ source += " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+ source += " " + config.llvm_library_intdir + "/liboffload.kernels.a"
+ return source
+
# Add platform targets
host_targets = [
diff --git a/offload/test/offloading/kernels_memset.c b/offload/test/offloading/kernels_memset.c
new file mode 100644
index 00000000000000..4cdbd56c366a88
--- /dev/null
+++ b/offload/test/offloading/kernels_memset.c
@@ -0,0 +1,61 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN: %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 64;
+ const int block_size = 256;
+ const int N = num_blocks * block_size;
+ int *data =
+ (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device());
+
+ // clang-format off
+ // CHECK: Launching kernel __memset_zero with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, '\0', N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (data[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != 0)
+ __builtin_trap();
+ }
+
+ // clang-format off
+ // CHECK: Launching kernel __memset_ones with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, ~0, N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (data[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != ~0)
+ __builtin_trap();
+ }
+
+ // clang-format off
+ // CHECK: Launching kernel __memset with 256 blocks and 256 threads in SPMD mode
+ // CHECK: Launching kernel __omp_offloading{{.*}} with 256 blocks and 256 threads in SPMD mode
+ omp_target_memset(data, '$', N * sizeof(int), omp_get_default_device());
+ // clang-format on
+
+ char *cdata = (char *)data;
+#pragma omp target teams num_teams(num_blocks * sizeof(int)) \
+ thread_limit(block_size)
+ {
+#pragma omp parallel
+ if (cdata[omp_get_team_num() * omp_get_num_threads() +
+ omp_get_thread_num()] != '$')
+ __builtin_trap();
+ }
+
+ return 0;
+}
More information about the cfe-commits
mailing list