[clang] [llvm] [Offload] Provide a kernel library useable by the offload runtime (PR #104168)

Johannes Doerfert via cfe-commits cfe-commits at lists.llvm.org
Wed Aug 14 11:57:05 PDT 2024


https://github.com/jdoerfert created https://github.com/llvm/llvm-project/pull/104168

As mentioned in #68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly and implements memset with it. Note that these kernels are automatically linked into an application that has device code.

>From defcf1effca6b441fc687477f406295601806163 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 12 Aug 2024 11:53:06 -0700
Subject: [PATCH] [Offload] Provide a kernel library useable by the offload
 runtime

As mentioned in #68706, it is useful to be able to call kernels from the
runtime, e.g., to perform memset. This patch provides a kernel library
that can be invoked from the offload runtime directly.
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  9 ++-
 offload/DeviceRTL/CMakeLists.txt              |  1 +
 offload/include/device.h                      |  3 +
 offload/include/omptarget.h                   |  5 ++
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  5 +-
 .../common/include/PluginInterface.h          | 12 +++-
 .../common/src/PluginInterface.cpp            | 58 +++++++++++++++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  5 +-
 offload/plugins-nextgen/host/src/rtl.cpp      |  5 +-
 offload/src/CMakeLists.txt                    | 18 +++++
 offload/src/Kernels/Memory.cpp                | 53 +++++++++++++++
 offload/src/OpenMP/API.cpp                    | 65 +++++++++++++------
 offload/src/device.cpp                        | 11 ++++
 offload/src/exports                           |  1 +
 offload/src/interface.cpp                     | 15 +++++
 offload/test/jit/type_punning.c               |  4 +-
 offload/test/lit.cfg                          |  5 +-
 offload/test/offloading/kernels_memset.c      | 61 +++++++++++++++++
 18 files changed, 296 insertions(+), 40 deletions(-)
 create mode 100644 offload/src/Kernels/Memory.cpp
 create mode 100644 offload/test/offloading/kernels_memset.c

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1cba3e1220264a..071100a73cab88 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
                     options::OPT_fno_openmp, false)) {
     // We need libomptarget (liboffload) if it's the choosen offloading runtime.
     if (Args.hasFlag(options::OPT_foffload_via_llvm,
-                     options::OPT_fno_offload_via_llvm, false))
+                     options::OPT_fno_offload_via_llvm, false)) {
       CmdArgs.push_back("-lomptarget");
+      if (!Args.hasArg(options::OPT_nogpulib))
+        CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
+    }
     return false;
   }
 
@@ -1237,10 +1240,10 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
       CmdArgs.push_back("-lrt");
 
   if (IsOffloadingHost)
-    CmdArgs.push_back("-lomptarget");
+      CmdArgs.push_back("-lomptarget");
 
   if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
-    CmdArgs.push_back("-lomptarget.devicertl");
+      CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
 
   addArchSpecificRPath(TC, Args, CmdArgs);
 
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 7818c8d752599c..e321047f781a3e 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
       "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
 endif()
 list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
+set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE)
 
 set(include_files
   ${include_directory}/Allocator.h
diff --git a/offload/include/device.h b/offload/include/device.h
index 3132d35b7b38c8..d3415785708d62 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -124,6 +124,9 @@ struct DeviceTy {
   /// Calls the corresponding print device info function in the plugin.
   bool printDeviceInfo();
 
+  /// Return the handle to the kernel with name \p Name in \p HandlePtr.
+  int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr);
+
   /// Event related interfaces.
   /// {
   /// Create an event.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 2b6445e9fbe550..f4ff5d33f7bf0f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper(
 int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                         int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
 
+/// Launch the kernel \p KernelName with a CUDA style launch and the given grid
+/// sizes and arguments (\p KernelArgs).
+int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName,
+		      KernelArgsTy *KernelArgs);
+
 // Non-blocking synchronization for target nowait regions. This function
 // acquires the asynchronous context from task data of the current task being
 // executed and tries to query for the completion of its operations. If the
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 604683370cd27d..5397408f21fabb 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2107,13 +2107,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the AMDGPU kernel.
     AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    new (AMDGPUKernel) AMDGPUKernelTy(Name);
+    new (AMDGPUKernel) AMDGPUKernelTy(Name.data());
 
     return *AMDGPUKernel;
   }
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 81823338fe2112..4ace1c9972a73c 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -928,8 +928,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
-  /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
+  /// Retrieve the kernel with name \p Name from image \p Image (or any image if
+  /// \p Image is null) and return it.
+  Expected<GenericKernelTy &> getKernel(llvm::StringRef Name,
+                                        DeviceImageTy *Image = nullptr);
 
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
@@ -947,6 +949,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
       UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
 
 private:
+  /// Allocate and construct a kernel object (users should use getKernel).
+  virtual Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) = 0;
+
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
@@ -1046,6 +1052,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 private:
   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+
+  DenseMap<StringRef, GenericKernelTy *> KernelMap;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 84d946507ea74a..e83477b5ddc59f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1533,6 +1533,55 @@ Error GenericDeviceTy::printInfo() {
   return Plugin::success();
 }
 
+Expected<GenericKernelTy &>
+GenericDeviceTy::getKernel(llvm::StringRef Name, DeviceImageTy *ImagePtr) {
+
+  GenericKernelTy *&KernelPtr = KernelMap[Name];
+  if (!KernelPtr) {
+    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+
+    auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * {
+      if (!GHandler.isSymbolInImage(*this, Image, Name))
+        return nullptr;
+
+      auto KernelOrErr = constructKernelImpl(Name);
+      if (Error Err = KernelOrErr.takeError()) {
+        [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+        DP("Failed to construct kernel ('%s'): %s", Name.data(),
+           ErrStr.c_str());
+        return nullptr;
+      }
+
+      GenericKernelTy &Kernel = *KernelOrErr;
+      if (auto Err = Kernel.init(*this, Image)) {
+        [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+        DP("Failed to initialize kernel ('%s'): %s", Name.data(),
+           ErrStr.c_str());
+        return nullptr;
+      }
+
+      return &Kernel;
+    };
+
+    if (ImagePtr) {
+      KernelPtr = CheckImage(*ImagePtr);
+    } else {
+      for (DeviceImageTy *Image : LoadedImages) {
+        KernelPtr = CheckImage(*Image);
+        if (KernelPtr)
+          break;
+      }
+    }
+  }
+
+  if (!KernelPtr)
+    return Plugin::error("Kernel '%s' not found or could not be initialized, "
+                         "searched %zu images",
+                         Name.data(),
+                         ImagePtr ? size_t(1) : LoadedImages.size());
+  return *KernelPtr;
+}
+
 Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
   return createEventImpl(EventPtrStorage);
 }
@@ -2147,20 +2196,15 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
 
   GenericDeviceTy &Device = Image.getDevice();
 
-  auto KernelOrErr = Device.constructKernel(Name);
+  auto KernelOrErr = Device.getKernel(Name, &Image);
   if (Error Err = KernelOrErr.takeError()) {
     REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
     return OFFLOAD_FAIL;
   }
 
-  GenericKernelTy &Kernel = *KernelOrErr;
-  if (auto Err = Kernel.init(Device, Image)) {
-    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
 
   // Note that this is not the kernel's device address.
-  *KernelPtr = &Kernel;
+  *KernelPtr = &*KernelOrErr;
   return OFFLOAD_SUCCESS;
 }
 
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033f..418654aa5690e6 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -471,13 +471,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
     if (!CUDAKernel)
       return Plugin::error("Failed to allocate memory for CUDA kernel");
 
-    new (CUDAKernel) CUDAKernelTy(Name);
+    new (CUDAKernel) CUDAKernelTy(Name.data());
 
     return *CUDAKernel;
   }
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index fe296b77c7d557..604b2648b6d629 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -151,13 +151,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   std::string getComputeUnitKind() const override { return "generic-64bit"; }
 
   /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
     if (!GenELF64Kernel)
       return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    new (GenELF64Kernel) GenELF64KernelTy(Name);
+    new (GenELF64Kernel) GenELF64KernelTy(Name.data());
 
     return *GenELF64Kernel;
   }
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index c5f5d902fad14c..75c3ea68eed107 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -62,6 +62,23 @@ endforeach()
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
 
+add_llvm_library(offload.kernels
+  STATIC
+
+  Kernels/Memory.cpp
+
+  LINK_LIBS
+  PUBLIC
+  omptarget.devicertl
+
+  NO_INSTALL_RPATH
+  BUILDTREE_ONLY
+)
+
+list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
 set_target_properties(omptarget PROPERTIES
@@ -69,3 +86,4 @@ set_target_properties(omptarget PROPERTIES
                       INSTALL_RPATH "$ORIGIN"
                       BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
 install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/src/Kernels/Memory.cpp b/offload/src/Kernels/Memory.cpp
new file mode 100644
index 00000000000000..94777872106b05
--- /dev/null
+++ b/offload/src/Kernels/Memory.cpp
@@ -0,0 +1,53 @@
+//===-- Kenrels/Memory.cpp - Memory related kernel definitions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#define LAUNCH_BOUNDS(MIN, MAX)                                                \
+  __attribute__((launch_bounds(MAX), amdgpu_flat_work_group_size(MIN, MAX)))
+#define INLINE [[clang::always_inline]] inline
+#define KERNEL [[gnu::weak]] __global__
+#define DEVICE __device__
+
+extern "C" {
+DEVICE int ompx_thread_id(int Dim);
+DEVICE int ompx_block_id(int Dim);
+DEVICE int ompx_block_dim(int Dim);
+DEVICE int ompx_grid_dim(int Dim);
+}
+
+namespace {
+INLINE
+DEVICE void __memset_impl(char *Ptr, int ByteVal, size_t NumBytes) {
+  int TId = ompx_thread_id(0);
+  int BId = ompx_block_id(0);
+  int BDim = ompx_block_dim(0);
+  size_t GId = BId * BDim + TId;
+  if (GId < NumBytes)
+    Ptr[GId] = ByteVal;
+}
+} // namespace
+
+extern "C" {
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, ByteVal, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset_zero(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, 0, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset_ones(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, ~0, NumBytes);
+}
+}
diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp
index e59bdba8abf0e4..210cadab25edee 100644
--- a/offload/src/OpenMP/API.cpp
+++ b/offload/src/OpenMP/API.cpp
@@ -392,25 +392,52 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
     DP("filling memory on host via memset");
     memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
   } else {
-    // TODO: replace the omp_target_memset() slow path with the fast path.
-    // That will require the ability to execute a kernel from within
-    // libomptarget.so (which we do not have at the moment).
-
-    // This is a very slow path: create a filled array on the host and upload
-    // it to the GPU device.
-    int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
-    if (Shadow) {
-      (void)memset(Shadow, ByteVal, NumBytes);
-      (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
-                              InitialDevice);
-      (void)omp_target_free(Shadow, InitialDevice);
-    } else {
-      // If the omp_target_alloc has failed, let's just not do anything.
-      // omp_target_memset does not have any good way to fail, so we
-      // simply avoid a catastrophic failure of the process for now.
-      DP("omp_target_memset failed to fill memory due to error with "
-         "omp_target_alloc");
+    struct LaunchArgsTy {
+      void *Ptr;
+      int ByteVal;
+      size_t NumBytes;
+    } LaunchArgs{Ptr, ByteVal, NumBytes};
+
+    auto NumThreads = NumBytes > 256 ? 256 : NumBytes;
+    auto NumBlocks = (NumBytes + 255) / 256;
+    const char *KernelName = "__memset";
+    switch (ByteVal) {
+    case 0:
+      KernelName = "__memset_zero";
+      break;
+    case ~0:
+      KernelName = "__memset_ones";
+      break;
+    default:
+      break;
+    };
+    // Try to launch the __memset kernel first.
+    KernelArgsTy KernelArgs;
+    KernelArgs.NumTeams[0] = NumBlocks;
+    KernelArgs.ThreadLimit[0] = NumThreads;
+    struct {
+      size_t LaunchArgsSize;
+      void *LaunchArgs;
+    } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs};
+    KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs);
+    KernelArgs.Flags.IsCUDA = true;
+    if (__tgt_launch_by_name(nullptr, DeviceNum, KernelName, &KernelArgs)) {
+      // This is a very slow path: create a filled array on the host and upload
+      // it to the GPU device.
+      int InitialDevice = omp_get_initial_device();
+      void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+      if (Shadow) {
+        (void)memset(Shadow, ByteVal, NumBytes);
+        (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+                                InitialDevice);
+        (void)omp_target_free(Shadow, InitialDevice);
+      } else {
+        // If the omp_target_alloc has failed, let's just not do anything.
+        // omp_target_memset does not have any good way to fail, so we
+        // simply avoid a catastrophic failure of the process for now.
+        DP("omp_target_memset failed to fill memory due to error with "
+           "omp_target_alloc");
+      }
     }
   }
 
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 943c7782787306..84660fbedaf547 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -226,6 +226,17 @@ bool DeviceTy::printDeviceInfo() {
   return true;
 }
 
+int32_t DeviceTy::getKernelHandle(llvm::StringRef Name, void **HandlePtr) {
+  auto KernelOrErr = RTL->getDevice(RTLDeviceID).getKernel(Name);
+  if (!KernelOrErr) {
+    [[maybe_unused]] auto ErrStr = toString(KernelOrErr.takeError());
+    DP("%s\n", ErrStr.c_str());
+    return OFFLOAD_FAIL;
+  }
+  *HandlePtr = &*KernelOrErr;
+  return OFFLOAD_SUCCESS;
+}
+
 // Whether data can be copied to DstDevice directly
 bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
   if (RTL != DstDevice.RTL)
diff --git a/offload/src/exports b/offload/src/exports
index 7bdc7d2a531bb3..b7671dd1421bd6 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -27,6 +27,7 @@ VERS1.0 {
     __tgt_target_nowait_mapper;
     __tgt_target_teams_nowait_mapper;
     __tgt_target_kernel;
+    __tgt_launch_by_name;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
     __tgt_target_kernel_replay;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 21f9114ac2b088..dad643187fba26 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -394,6 +394,21 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                    HostPtr, KernelArgs);
 }
 
+EXTERN int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId,
+                                const char *KernelName,
+                                KernelArgsTy *KernelArgs) {
+  auto DeviceOrErr = PM->getDevice(DeviceId);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+  auto &Device = *DeviceOrErr;
+  void *Handle;
+  if (Device.getKernelHandle(KernelName, &Handle))
+    return OFFLOAD_FAIL;
+  AsyncInfoTy AsyncInfo(*DeviceOrErr);
+  return DeviceOrErr->launchKernel(Handle, nullptr, nullptr, *KernelArgs,
+                                   AsyncInfo);
+}
+
 /// Activates the record replay mechanism.
 /// \param DeviceId The device identifier to execute the target region.
 /// \param MemorySize The number of bytes to be (pre-)allocated
diff --git a/offload/test/jit/type_punning.c b/offload/test/jit/type_punning.c
index 574168b8a69cbb..c2cd415a5fc75f 100644
--- a/offload/test/jit/type_punning.c
+++ b/offload/test/jit/type_punning.c
@@ -13,8 +13,8 @@
 // Ensure that there is only the kernel function left, not any outlined
 // parallel regions.
 //
-// CHECK: define
-// CHECK-NOT: define
+// CHECK:     define {{.*}}__omp_offloading_
+// CHECK-NOT: call {{.*}}@__
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index b4fc7d3b333b35..907300096f1665 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -179,7 +179,10 @@ def remove_suffix_if_present(name):
         return name
 
 def add_libraries(source):
-    return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+    source += " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+    source += " " + config.llvm_library_intdir + "/liboffload.kernels.a"
+    return source
+
 
 # Add platform targets
 host_targets = [
diff --git a/offload/test/offloading/kernels_memset.c b/offload/test/offloading/kernels_memset.c
new file mode 100644
index 00000000000000..4cdbd56c366a88
--- /dev/null
+++ b/offload/test/offloading/kernels_memset.c
@@ -0,0 +1,61 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+  const int num_blocks = 64;
+  const int block_size = 256;
+  const int N = num_blocks * block_size;
+  int *data =
+      (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device());
+
+  // clang-format off
+  // CHECK: Launching kernel __memset_zero with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, '\0', N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (data[omp_get_team_num() * omp_get_num_threads() +
+             omp_get_thread_num()] != 0)
+      __builtin_trap();
+  }
+
+  // clang-format off
+  // CHECK: Launching kernel __memset_ones with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, ~0, N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (data[omp_get_team_num() * omp_get_num_threads() +
+             omp_get_thread_num()] != ~0)
+      __builtin_trap();
+  }
+
+  // clang-format off
+  // CHECK: Launching kernel __memset with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 256 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, '$', N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+  char *cdata = (char *)data;
+#pragma omp target teams num_teams(num_blocks * sizeof(int))                   \
+    thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (cdata[omp_get_team_num() * omp_get_num_threads() +
+              omp_get_thread_num()] != '$')
+      __builtin_trap();
+  }
+
+  return 0;
+}



More information about the cfe-commits mailing list