[clang] [llvm] [Offload] Provide a kernel library useable by the offload runtime (PR #104168)

Johannes Doerfert via cfe-commits cfe-commits at lists.llvm.org
Wed Aug 14 16:40:58 PDT 2024


https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/104168

From bda519493667d3554365061475308da3786bfa9d Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 12 Aug 2024 11:53:06 -0700
Subject: [PATCH] [Offload] Provide a kernel library useable by the offload
 runtime

As mentioned in #68706, it is useful to be able to call kernels from the
runtime, e.g., to perform memset. This patch provides a kernel library
that can be invoked from the offload runtime directly.
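
For illustration, the user-visible effect on omp_target_memset is that the fill
below now runs as a device kernel (__memset_zero from liboffload.kernels)
instead of going through a host staging buffer; the sizes match the new
kernels_memset.c test and nothing here is API beyond this patch:

  #include <omp.h>

  int main(void) {
    int N = 64 * 256;
    int *Data =
        (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device());
    // Fast path after this patch: launch __memset_zero on the device; the old
    // host staging buffer remains as a fallback if the launch fails.
    omp_target_memset(Data, 0, N * sizeof(int), omp_get_default_device());
    omp_target_free(Data, omp_get_default_device());
    return 0;
  }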
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |   7 +-
 offload/DeviceRTL/CMakeLists.txt              |   1 +
 offload/include/device.h                      |   3 +
 offload/include/omptarget.h                   |   5 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  56 ++-------
 .../common/include/PluginInterface.h          |  33 ++++--
 .../common/src/PluginInterface.cpp            | 107 ++++++++++++++++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  33 +++---
 offload/plugins-nextgen/host/src/rtl.cpp      |   5 +-
 offload/src/CMakeLists.txt                    |  18 +++
 offload/src/Kernels/Memory.cpp                |  53 +++++++++
 offload/src/OpenMP/API.cpp                    |  65 +++++++----
 offload/src/device.cpp                        |  11 ++
 offload/src/exports                           |   1 +
 offload/src/interface.cpp                     |  15 +++
 offload/test/jit/type_punning.c               |   4 +-
 offload/test/lit.cfg                          |   5 +-
 offload/test/offloading/kernels_memset.c      |  61 ++++++++++
 18 files changed, 368 insertions(+), 115 deletions(-)
 create mode 100644 offload/src/Kernels/Memory.cpp
 create mode 100644 offload/test/offloading/kernels_memset.c

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1cba3e1220264a..4080356c636dc2 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
                     options::OPT_fno_openmp, false)) {
     // We need libomptarget (liboffload) if it's the chosen offloading runtime.
     if (Args.hasFlag(options::OPT_foffload_via_llvm,
-                     options::OPT_fno_offload_via_llvm, false))
+                     options::OPT_fno_offload_via_llvm, false)) {
       CmdArgs.push_back("-lomptarget");
+      if (!Args.hasArg(options::OPT_nogpulib))
+        CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
+    }
     return false;
   }
 
@@ -1240,7 +1243,7 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
     CmdArgs.push_back("-lomptarget");
 
   if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib))
-    CmdArgs.push_back("-lomptarget.devicertl");
+    CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"});
 
   addArchSpecificRPath(TC, Args, CmdArgs);
 
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 7818c8d752599c..e321047f781a3e 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
       "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
 endif()
 list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
+set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE)
 
 set(include_files
   ${include_directory}/Allocator.h
diff --git a/offload/include/device.h b/offload/include/device.h
index 3132d35b7b38c8..d3415785708d62 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -124,6 +124,9 @@ struct DeviceTy {
   /// Calls the corresponding print device info function in the plugin.
   bool printDeviceInfo();
 
+  /// Return the handle to the kernel with name \p Name in \p HandlePtr.
+  int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr);
+
   /// Event related interfaces.
   /// {
   /// Create an event.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 2b6445e9fbe550..f4ff5d33f7bf0f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper(
 int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                         int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
 
+/// Launch the kernel \p KernelName with a CUDA style launch and the given grid
+/// sizes and arguments (\p KernelArgs).
+int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName,
+                         KernelArgsTy *KernelArgs);
+
 // Non-blocking synchronization for target nowait regions. This function
 // acquires the asynchronous context from task data of the current task being
 // executed and tries to query for the completion of its operations. If the
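
For reference, runtime code can use the new entry point much as the
omp_target_memset() change below does: pack the kernel parameters into one
CUDA-style argument block, pick the grid sizes, and launch by name. A minimal
sketch (LaunchMemset is a hypothetical helper; the struct layout must match the
__memset kernel's parameter list):

  static int LaunchMemset(int64_t DeviceNum, void *Ptr, int ByteVal,
                          size_t NumBytes) {
    struct {
      void *Ptr;
      int ByteVal;
      size_t NumBytes;
    } LaunchArgs{Ptr, ByteVal, NumBytes};

    KernelArgsTy KernelArgs;
    KernelArgs.NumTeams[0] = (NumBytes + 255) / 256; // one thread per byte
    KernelArgs.ThreadLimit[0] = NumBytes > 256 ? 256 : NumBytes;
    struct {
      size_t LaunchArgsSize;
      void *LaunchArgs;
    } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs};
    KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs);
    KernelArgs.Flags.IsCUDA = true;
    return __tgt_launch_by_name(/*Loc=*/nullptr, DeviceNum, "__memset",
                                &KernelArgs);
  }
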
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 604683370cd27d..5d135795170563 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2016,20 +2016,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) override {
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
-      Image.setPendingGlobalDtors();
-
-    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
+  virtual Expected<StringRef>
+  getGlobalConstructorName(DeviceImageTy &Image) override {
+    return "amdgcn.device.init";
   }
-
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) override {
-    if (Image.hasPendingGlobalDtors())
-      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-    return Plugin::success();
+  virtual Expected<StringRef>
+  getGlobalDestructorName(DeviceImageTy &Image) override {
+    return "amdgcn.device.fini";
   }
 
   uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
@@ -2107,13 +2100,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the AMDGPU kernel.
     AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    new (AMDGPUKernel) AMDGPUKernelTy(Name);
+    new (AMDGPUKernel) AMDGPUKernelTy(Name.data());
 
     return *AMDGPUKernel;
   }
@@ -2791,38 +2785,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
 
-  /// Common method to invoke a single threaded constructor or destructor
-  /// kernel by name.
-  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                                 bool IsCtor) {
-    const char *KernelName =
-        IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
-    // Perform a quick check for the named kernel in the image. The kernel
-    // should be created by the 'amdgpu-lower-ctor-dtor' pass.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
-      return Plugin::success();
-
-    // Allocate and construct the AMDGPU kernel.
-    AMDGPUKernelTy AMDGPUKernel(KernelName);
-    if (auto Err = AMDGPUKernel.init(*this, Image))
-      return Err;
-
-    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
-
-    KernelArgsTy KernelArgs = {};
-    if (auto Err =
-            AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
-                                    /*NumBlocks=*/1ul, KernelArgs,
-                                    KernelLaunchParamsTy{}, AsyncInfoWrapper))
-      return Err;
-
-    Error Err = Plugin::success();
-    AsyncInfoWrapper.finalize(Err);
-
-    return Err;
-  }
-
   /// Detect if current architecture is an APU.
   Error checkIfAPU() {
     // TODO: replace with ROCr API once it becomes available.
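
The same simplification applies to every plugin: a device class now only names
its constructor/destructor kernels and the generic layer performs the lookup
and launch. A sketch with a hypothetical MyDeviceTy (all other overrides stay
as they are and are omitted):

  struct MyDeviceTy : public GenericDeviceTy {
    Expected<StringRef> getGlobalConstructorName(DeviceImageTy &Image) override {
      return "my.device.init"; // An empty string means "no constructor".
    }
    Expected<StringRef> getGlobalDestructorName(DeviceImageTy &Image) override {
      return "my.device.fini";
    }
    // constructKernelImpl() and the remaining pure virtuals as before.
  };
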
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 81823338fe2112..08bdb0d75a464f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -722,18 +722,17 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error synchronize(__tgt_async_info *AsyncInfo);
   virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
 
-  /// Invokes any global constructors on the device if present and is required
-  /// by the target.
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) {
-    return Error::success();
+  /// Call the ctor/dtor of image \p Image, if available.
+  Error callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor);
+
+  /// Return the name of the global constructors on the device.
+  virtual Expected<StringRef> getGlobalConstructorName(DeviceImageTy &Image) {
+    return "";
   }
 
-  /// Invokes any global destructors on the device if present and is required
-  /// by the target.
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) {
-    return Error::success();
+  /// Return the name of the global destructors on the device.
+  virtual Expected<StringRef> getGlobalDestructorName(DeviceImageTy &Image) {
+    return "";
   }
 
   /// Query for the completion of the pending operations on the __tgt_async_info
@@ -928,8 +927,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
-  /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
+  /// Retrieve the kernel with name \p Name from image \p Image (or any image if
+  /// \p Image is null) and return it. If \p Optional is true, the function
+  /// returns success if there is no kernel with the given name.
+  Expected<GenericKernelTy *> getKernel(llvm::StringRef Name,
+                                        DeviceImageTy *Image = nullptr,
+                                        bool Optional = false);
 
   /// Reference to the underlying plugin that created this device.
   GenericPluginTy &Plugin;
@@ -947,6 +950,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
       UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
 
 private:
+  /// Allocate and construct a kernel object (users should use getKernel).
+  virtual Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) = 0;
+
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
@@ -1046,6 +1053,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 private:
   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+
+  DenseMap<StringRef, GenericKernelTy *> KernelMap;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
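
Usage of the new lookup, mirroring how callGlobalCtorDtor() and get_function()
use it in PluginInterface.cpp: getKernel() constructs and initializes the
kernel lazily, caches it per device, and with Optional=true reports "not found"
as a null result instead of an error. A minimal sketch (launchByName is a
hypothetical helper):

  static Error launchByName(GenericDeviceTy &Device, StringRef Name,
                            KernelArgsTy &KernelArgs) {
    auto KernelOrErr = Device.getKernel(Name, /*Image=*/nullptr,
                                        /*Optional=*/true);
    if (auto Err = KernelOrErr.takeError())
      return Err;
    if (GenericKernelTy *Kernel = *KernelOrErr) {
      AsyncInfoWrapperTy AsyncInfoWrapper(Device, /*AsyncInfoPtr=*/nullptr);
      if (auto Err = Kernel->launch(Device, /*ArgPtrs=*/nullptr,
                                    /*ArgOffsets=*/nullptr, KernelArgs,
                                    AsyncInfoWrapper))
        return Err;
      Error Err = Plugin::success();
      AsyncInfoWrapper.finalize(Err);
      return Err;
    }
    return Plugin::success();
  }
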
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 84d946507ea74a..ac7709f4229bc3 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -809,7 +809,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
 
 Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
   for (DeviceImageTy *Image : LoadedImages)
-    if (auto Err = callGlobalDestructors(Plugin, *Image))
+    if (auto Err = callGlobalCtorDtor(*Image, /*IsCtor=*/false))
       return Err;
 
   if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
@@ -866,6 +866,37 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   return deinitImpl();
 }
+
+Error GenericDeviceTy::callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor) {
+  auto NameOrErr =
+      IsCtor ? getGlobalConstructorName(Image) : getGlobalDestructorName(Image);
+  if (auto Err = NameOrErr.takeError())
+    return Err;
+  // No error but an empty name means there is no ctor/dtor.
+  if (NameOrErr->empty())
+    return Plugin::success();
+
+  auto KernelOrErr = getKernel(*NameOrErr, &Image, /*Optional=*/true);
+  if (auto Err = KernelOrErr.takeError())
+    return Err;
+
+  if (GenericKernelTy *Kernel = *KernelOrErr) {
+    KernelArgsTy KernelArgs;
+    KernelArgs.NumTeams[0] = KernelArgs.ThreadLimit[0] = 1;
+    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
+    if (auto Err = Kernel->launch(*this, /*ArgPtrs=*/nullptr,
+                                  /*ArgOffsets=*/nullptr, KernelArgs,
+                                  AsyncInfoWrapper))
+      return Err;
+
+    Error Err = Plugin::success();
+    AsyncInfoWrapper.finalize(Err);
+    return Err;
+  }
+
+  return Plugin::success();
+}
+
 Expected<DeviceImageTy *>
 GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                             const __tgt_device_image *InputTgtImage) {
@@ -927,8 +958,8 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
 #endif
 
   // Call any global constructors present on the device.
-  if (auto Err = callGlobalConstructors(Plugin, *Image))
-    return std::move(Err);
+  if (auto Err = callGlobalCtorDtor(*Image, /*IsCtor=*/true))
+    return Err;
 
   // Return the pointer to the table of entries.
   return Image;
@@ -1533,6 +1564,66 @@ Error GenericDeviceTy::printInfo() {
   return Plugin::success();
 }
 
+Expected<GenericKernelTy *> GenericDeviceTy::getKernel(llvm::StringRef Name,
+                                                       DeviceImageTy *ImagePtr,
+                                                       bool Optional) {
+  bool KernelFound = false;
+  GenericKernelTy *&KernelPtr = KernelMap[Name];
+  if (!KernelPtr) {
+    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+
+    auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * {
+      if (!GHandler.isSymbolInImage(*this, Image, Name))
+        return nullptr;
+
+      auto KernelOrErr = constructKernelImpl(Name);
+      if (Error Err = KernelOrErr.takeError()) {
+        [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+        DP("Failed to construct kernel ('%s'): %s", Name.data(),
+           ErrStr.c_str());
+        return nullptr;
+      }
+
+      GenericKernelTy &Kernel = *KernelOrErr;
+      if (auto Err = Kernel.init(*this, Image)) {
+        [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+        DP("Failed to initialize kernel ('%s'): %s", Name.data(),
+           ErrStr.c_str());
+        return nullptr;
+      }
+
+      return &Kernel;
+    };
+
+    if (ImagePtr) {
+      KernelPtr = CheckImage(*ImagePtr);
+    } else {
+      for (DeviceImageTy *Image : LoadedImages) {
+        KernelPtr = CheckImage(*Image);
+        if (KernelPtr)
+          break;
+      }
+    }
+  }
+
+  // If we didn't find the kernel and it was optional, we do not emit an error.
+  if (!KernelPtr && !KernelFound && Optional)
+    return nullptr;
+  // If we didn't find the kernel and it was not optional, we will emit an
+  // error.
+  if (!KernelPtr && !KernelFound)
+    return Plugin::error("Kernel '%s' not found, searched %zu images",
+                         Name.data(),
+                         ImagePtr ? size_t(1) : LoadedImages.size());
+  // If we found the kernel but couldn't initialize it, we will emit an error.
+  if (!KernelPtr)
+    return Plugin::error(
+        "Kernel '%s' failed to initialize, searched %zu images", Name.data(),
+        ImagePtr ? size_t(1) : LoadedImages.size());
+  // Found the kernel and initialized it.
+  return KernelPtr;
+}
+
 Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
   return createEventImpl(EventPtrStorage);
 }
@@ -2147,20 +2238,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
 
   GenericDeviceTy &Device = Image.getDevice();
 
-  auto KernelOrErr = Device.constructKernel(Name);
+  auto KernelOrErr = Device.getKernel(Name, &Image);
   if (Error Err = KernelOrErr.takeError()) {
     REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
     return OFFLOAD_FAIL;
   }
 
-  GenericKernelTy &Kernel = *KernelOrErr;
-  if (auto Err = Kernel.init(Device, Image)) {
-    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
   // Note that this is not the kernel's device address.
-  *KernelPtr = &Kernel;
+  *KernelPtr = *KernelOrErr;
   return OFFLOAD_SUCCESS;
 }
 
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b6465d61bd033f..2a7b6f844cca05 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -393,22 +393,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) override {
-    // Check for the presense of global destructors at initialization time. This
-    // is required when the image may be deallocated before destructors are run.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
-      Image.setPendingGlobalDtors();
-
-    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
+  virtual Expected<StringRef>
+  getGlobalConstructorName(DeviceImageTy &Image) override {
+    if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/true))
+      return Err;
+    return "nvptx$device$init";
   }
-
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) override {
-    if (Image.hasPendingGlobalDtors())
-      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-    return Plugin::success();
+  virtual Expected<StringRef>
+  getGlobalDestructorName(DeviceImageTy &Image) override {
+    if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/false))
+      return Err;
+    return "nvptx$device$fini";
   }
 
   Expected<std::unique_ptr<MemoryBuffer>>
@@ -471,13 +466,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
     if (!CUDAKernel)
       return Plugin::error("Failed to allocate memory for CUDA kernel");
 
-    new (CUDAKernel) CUDAKernelTy(Name);
+    new (CUDAKernel) CUDAKernelTy(Name.data());
 
     return *CUDAKernel;
   }
@@ -1149,8 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
 
-  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                                 bool IsCtor) {
+  Error prepareGlobalCtorDtorCommon(DeviceImageTy &Image, bool IsCtor) {
     const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
     // Perform a quick check for the named kernel in the image. The kernel
     // should be created by the 'nvptx-lower-ctor-dtor' pass.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index fe296b77c7d557..604b2648b6d629 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -151,13 +151,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   std::string getComputeUnitKind() const override { return "generic-64bit"; }
 
   /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
+  Expected<GenericKernelTy &>
+  constructKernelImpl(llvm::StringRef Name) override {
     // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
     if (!GenELF64Kernel)
       return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    new (GenELF64Kernel) GenELF64KernelTy(Name);
+    new (GenELF64Kernel) GenELF64KernelTy(Name.data());
 
     return *GenELF64Kernel;
   }
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index c5f5d902fad14c..75c3ea68eed107 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -62,6 +62,23 @@ endforeach()
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
 
+add_llvm_library(offload.kernels
+  STATIC
+
+  Kernels/Memory.cpp
+
+  LINK_LIBS
+  PUBLIC
+  omptarget.devicertl
+
+  NO_INSTALL_RPATH
+  BUILDTREE_ONLY
+)
+
+list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS)
+target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm )
+
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
 set_target_properties(omptarget PROPERTIES
@@ -69,3 +86,4 @@ set_target_properties(omptarget PROPERTIES
                       INSTALL_RPATH "$ORIGIN"
                       BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
 install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
+install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
diff --git a/offload/src/Kernels/Memory.cpp b/offload/src/Kernels/Memory.cpp
new file mode 100644
index 00000000000000..94777872106b05
--- /dev/null
+++ b/offload/src/Kernels/Memory.cpp
@@ -0,0 +1,53 @@
+//===-- Kernels/Memory.cpp - Memory related kernel definitions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#define LAUNCH_BOUNDS(MIN, MAX)                                                \
+  __attribute__((launch_bounds(MAX), amdgpu_flat_work_group_size(MIN, MAX)))
+#define INLINE [[clang::always_inline]] inline
+#define KERNEL [[gnu::weak]] __global__
+#define DEVICE __device__
+
+extern "C" {
+DEVICE int ompx_thread_id(int Dim);
+DEVICE int ompx_block_id(int Dim);
+DEVICE int ompx_block_dim(int Dim);
+DEVICE int ompx_grid_dim(int Dim);
+}
+
+namespace {
+INLINE
+DEVICE void __memset_impl(char *Ptr, int ByteVal, size_t NumBytes) {
+  int TId = ompx_thread_id(0);
+  int BId = ompx_block_id(0);
+  int BDim = ompx_block_dim(0);
+  size_t GId = BId * BDim + TId;
+  if (GId < NumBytes)
+    Ptr[GId] = ByteVal;
+}
+} // namespace
+
+extern "C" {
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, ByteVal, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset_zero(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, 0, NumBytes);
+}
+
+KERNEL void LAUNCH_BOUNDS(1, 256)
+    __memset_ones(char *Ptr, int ByteVal, size_t NumBytes) {
+  __memset_impl(Ptr, ~0, NumBytes);
+}
+}
diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp
index e59bdba8abf0e4..210cadab25edee 100644
--- a/offload/src/OpenMP/API.cpp
+++ b/offload/src/OpenMP/API.cpp
@@ -392,25 +392,52 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
     DP("filling memory on host via memset");
     memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
   } else {
-    // TODO: replace the omp_target_memset() slow path with the fast path.
-    // That will require the ability to execute a kernel from within
-    // libomptarget.so (which we do not have at the moment).
-
-    // This is a very slow path: create a filled array on the host and upload
-    // it to the GPU device.
-    int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
-    if (Shadow) {
-      (void)memset(Shadow, ByteVal, NumBytes);
-      (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
-                              InitialDevice);
-      (void)omp_target_free(Shadow, InitialDevice);
-    } else {
-      // If the omp_target_alloc has failed, let's just not do anything.
-      // omp_target_memset does not have any good way to fail, so we
-      // simply avoid a catastrophic failure of the process for now.
-      DP("omp_target_memset failed to fill memory due to error with "
-         "omp_target_alloc");
+    struct LaunchArgsTy {
+      void *Ptr;
+      int ByteVal;
+      size_t NumBytes;
+    } LaunchArgs{Ptr, ByteVal, NumBytes};
+
+    auto NumThreads = NumBytes > 256 ? 256 : NumBytes;
+    auto NumBlocks = (NumBytes + 255) / 256;
+    const char *KernelName = "__memset";
+    switch (ByteVal) {
+    case 0:
+      KernelName = "__memset_zero";
+      break;
+    case ~0:
+      KernelName = "__memset_ones";
+      break;
+    default:
+      break;
+    }
+    // Try to launch the __memset kernel first.
+    KernelArgsTy KernelArgs;
+    KernelArgs.NumTeams[0] = NumBlocks;
+    KernelArgs.ThreadLimit[0] = NumThreads;
+    struct {
+      size_t LaunchArgsSize;
+      void *LaunchArgs;
+    } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs};
+    KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs);
+    KernelArgs.Flags.IsCUDA = true;
+    if (__tgt_launch_by_name(nullptr, DeviceNum, KernelName, &KernelArgs)) {
+      // This is a very slow path: create a filled array on the host and upload
+      // it to the GPU device.
+      int InitialDevice = omp_get_initial_device();
+      void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+      if (Shadow) {
+        (void)memset(Shadow, ByteVal, NumBytes);
+        (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+                                InitialDevice);
+        (void)omp_target_free(Shadow, InitialDevice);
+      } else {
+        // If the omp_target_alloc has failed, let's just not do anything.
+        // omp_target_memset does not have any good way to fail, so we
+        // simply avoid a catastrophic failure of the process for now.
+        DP("omp_target_memset failed to fill memory due to error with "
+           "omp_target_alloc");
+      }
     }
   }
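
To make the grid sizing above concrete: the library kernels write one byte per
thread, so for the new test below, which fills 64 * 256 ints = 65536 bytes,

  NumThreads = NumBytes > 256 ? 256 : NumBytes  ->  256
  NumBlocks  = (NumBytes + 255) / 256           ->  256

which is exactly the "__memset_zero with 256 blocks and 256 threads" its CHECK
lines expect; threads whose global id is >= NumBytes simply do nothing.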
 
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 943c7782787306..9b782009d08cd0 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -226,6 +226,17 @@ bool DeviceTy::printDeviceInfo() {
   return true;
 }
 
+int32_t DeviceTy::getKernelHandle(llvm::StringRef Name, void **HandlePtr) {
+  auto KernelOrErr = RTL->getDevice(RTLDeviceID).getKernel(Name);
+  if (!KernelOrErr) {
+    [[maybe_unused]] auto ErrStr = toString(KernelOrErr.takeError());
+    DP("%s\n", ErrStr.c_str());
+    return OFFLOAD_FAIL;
+  }
+  *HandlePtr = *KernelOrErr;
+  return OFFLOAD_SUCCESS;
+}
+
 // Whether data can be copied to DstDevice directly
 bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
   if (RTL != DstDevice.RTL)
diff --git a/offload/src/exports b/offload/src/exports
index 7bdc7d2a531bb3..b7671dd1421bd6 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -27,6 +27,7 @@ VERS1.0 {
     __tgt_target_nowait_mapper;
     __tgt_target_teams_nowait_mapper;
     __tgt_target_kernel;
+    __tgt_launch_by_name;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
     __tgt_target_kernel_replay;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 21f9114ac2b088..dad643187fba26 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -394,6 +394,21 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                    HostPtr, KernelArgs);
 }
 
+EXTERN int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId,
+                                const char *KernelName,
+                                KernelArgsTy *KernelArgs) {
+  auto DeviceOrErr = PM->getDevice(DeviceId);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+  auto &Device = *DeviceOrErr;
+  void *Handle;
+  if (Device.getKernelHandle(KernelName, &Handle))
+    return OFFLOAD_FAIL;
+  AsyncInfoTy AsyncInfo(*DeviceOrErr);
+  return DeviceOrErr->launchKernel(Handle, nullptr, nullptr, *KernelArgs,
+                                   AsyncInfo);
+}
+
 /// Activates the record replay mechanism.
 /// \param DeviceId The device identifier to execute the target region.
 /// \param MemorySize The number of bytes to be (pre-)allocated
diff --git a/offload/test/jit/type_punning.c b/offload/test/jit/type_punning.c
index 574168b8a69cbb..c2cd415a5fc75f 100644
--- a/offload/test/jit/type_punning.c
+++ b/offload/test/jit/type_punning.c
@@ -13,8 +13,8 @@
 // Ensure that there is only the kernel function left, not any outlined
 // parallel regions.
 //
-// CHECK: define
-// CHECK-NOT: define
+// CHECK:     define {{.*}}__omp_offloading_
+// CHECK-NOT: call {{.*}}@__
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index b4fc7d3b333b35..907300096f1665 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -179,7 +179,10 @@ def remove_suffix_if_present(name):
         return name
 
 def add_libraries(source):
-    return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+    source += " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
+    source += " " + config.llvm_library_intdir + "/liboffload.kernels.a"
+    return source
+
 
 # Add platform targets
 host_targets = [
diff --git a/offload/test/offloading/kernels_memset.c b/offload/test/offloading/kernels_memset.c
new file mode 100644
index 00000000000000..4cdbd56c366a88
--- /dev/null
+++ b/offload/test/offloading/kernels_memset.c
@@ -0,0 +1,61 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
+//
+// REQUIRES: gpu
+
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+  const int num_blocks = 64;
+  const int block_size = 256;
+  const int N = num_blocks * block_size;
+  int *data =
+      (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device());
+
+  // clang-format off
+  // CHECK: Launching kernel __memset_zero with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, '\0', N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (data[omp_get_team_num() * omp_get_num_threads() +
+             omp_get_thread_num()] != 0)
+      __builtin_trap();
+  }
+
+  // clang-format off
+  // CHECK: Launching kernel __memset_ones with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, ~0, N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+#pragma omp target teams num_teams(num_blocks) thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (data[omp_get_team_num() * omp_get_num_threads() +
+             omp_get_thread_num()] != ~0)
+      __builtin_trap();
+  }
+
+  // clang-format off
+  // CHECK: Launching kernel __memset with 256 blocks and 256 threads in SPMD mode
+  // CHECK: Launching kernel __omp_offloading{{.*}} with 256 blocks and 256 threads in SPMD mode
+  omp_target_memset(data, '$', N * sizeof(int), omp_get_default_device());
+  // clang-format on
+
+  char *cdata = (char *)data;
+#pragma omp target teams num_teams(num_blocks * sizeof(int))                   \
+    thread_limit(block_size)
+  {
+#pragma omp parallel
+    if (cdata[omp_get_team_num() * omp_get_num_threads() +
+              omp_get_thread_num()] != '$')
+      __builtin_trap();
+  }
+
+  return 0;
+}


