[Openmp-commits] [openmp] b8e297d - [OpenMP][libomptarget] Improve kernel initialization in plugins

Kevin Sala via Openmp-commits openmp-commits at lists.llvm.org
Sun Aug 6 03:00:22 PDT 2023


Author: Kevin Sala
Date: 2023-08-06T11:53:58+02:00
New Revision: b8e297d1af5ae42e81b4c79e14a6d2427db0311b

URL: https://github.com/llvm/llvm-project/commit/b8e297d1af5ae42e81b4c79e14a6d2427db0311b
DIFF: https://github.com/llvm/llvm-project/commit/b8e297d1af5ae42e81b4c79e14a6d2427db0311b.diff

LOG: [OpenMP][libomptarget] Improve kernel initialization in plugins

This patch modifies the plugins so that the initialization of KernelTy objects
is done entirely in the init method. Previously, part of the initialization was
done in the constructKernelEntry method. That method is now called
constructKernel and only allocates and constructs a KernelTy object.

This patch prepares the kernel class for the new implementation of device
reductions.

Differential Revision: https://reviews.llvm.org/D156917

Added: 
    

Modified: 
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 0329ebdb019ad9..3d38e0fe8e61bb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1899,20 +1899,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the AMDGPU kernel.
+    AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
+    if (!AMDGPUKernel)
+      return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    Expected<OMPTgtExecModeFlags> ExecModeOrErr =
-        getExecutionModeForKernel(KernelEntry.name, Image);
-    if (!ExecModeOrErr)
-      return ExecModeOrErr.takeError();
+    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
 
-    // Allocate and initialize the AMDGPU kernel.
-    AMDGPUKernelTy *AMDKernel = Plugin::get().allocate<AMDGPUKernelTy>();
-    new (AMDKernel) AMDGPUKernelTy(KernelEntry.name, ExecModeOrErr.get());
-
-    return AMDKernel;
+    return *AMDGPUKernel;
   }
 
   /// Set the current context to this device's context. Do nothing since the

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 0834c9f932db43..0f2bb07818039f 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -746,21 +746,25 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
     __tgt_offload_entry &DeviceEntry) {
   DeviceEntry = KernelEntry;
 
+  // Retrieve the execution mode.
+  auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image);
+  if (!ExecModeOrErr)
+    return ExecModeOrErr.takeError();
+
   // Create a kernel object.
-  auto KernelOrErr = constructKernelEntry(KernelEntry, Image);
+  auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr);
   if (!KernelOrErr)
     return KernelOrErr.takeError();
 
-  GenericKernelTy *Kernel = *KernelOrErr;
-  assert(Kernel != nullptr && "Invalid kernel");
+  GenericKernelTy &Kernel = *KernelOrErr;
 
   // Initialize the kernel.
-  if (auto Err = Kernel->init(*this, Image))
+  if (auto Err = Kernel.init(*this, Image))
     return Err;
 
   // Set the device entry address to the kernel address and store the entry on
   // the entry table.
-  DeviceEntry.addr = (void *)Kernel;
+  DeviceEntry.addr = (void *)&Kernel;
   Image.getOffloadEntryTable().addEntry(DeviceEntry);
 
   return Plugin::success();

diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 9521323eaf3fbd..9cbef5a247f360 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -794,9 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                    __tgt_offload_entry &DeviceEntry);
 
   /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) = 0;
+  virtual Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) = 0;
 
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -837,8 +837,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
 protected:
   /// Return the execution mode used for kernel \p Name.
-  Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
-                                                          DeviceImageTy &Image);
+  virtual Expected<OMPTgtExecModeFlags>
+  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image);
 
   /// Environment variables defined by the LLVM OpenMP implementation
   /// regarding the initial number of streams and events.

diff  --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index a58ec028fa4812..e14cfc5deda10f 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -37,31 +37,80 @@ struct CUDAKernelTy;
 struct CUDADeviceTy;
 struct CUDAPluginTy;
 
+/// Class implementing the CUDA device images properties.
+struct CUDADeviceImageTy : public DeviceImageTy {
+  /// Create the CUDA image with the id and the target image pointer.
+  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
+
+  /// Load the image as a CUDA module.
+  Error loadModule() {
+    assert(!Module && "Module already loaded");
+
+    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Unload the CUDA module corresponding to the image.
+  Error unloadModule() {
+    assert(Module && "Module not loaded");
+
+    CUresult Res = cuModuleUnload(Module);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
+      return Err;
+
+    Module = nullptr;
+
+    return Plugin::success();
+  }
+
+  /// Getter of the CUDA module.
+  CUmodule getModule() const { return Module; }
+
+private:
+  /// The CUDA module that loaded the image.
+  CUmodule Module;
+};
+
 /// Class implementing the CUDA kernel functionalities which derives from the
 /// generic kernel class.
 struct CUDAKernelTy : public GenericKernelTy {
-  /// Create a CUDA kernel with a name, an execution mode, and the kernel
-  /// function.
-  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
-               CUfunction Func)
-      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+  /// Create a CUDA kernel with a name and an execution mode.
+  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
+      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
 
-  /// Initialize the CUDA kernel
+  /// Initialize the CUDA kernel.
   Error initImpl(GenericDeviceTy &GenericDevice,
                  DeviceImageTy &Image) override {
+    CUresult Res;
+    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
+
+    // Retrieve the function pointer of the kernel.
+    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
+    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
+                                 getName()))
+      return Err;
+
+    // Check that the function pointer is valid.
+    if (!Func)
+      return Plugin::error("Invalid function for kernel %s", getName());
+
     int MaxThreads;
-    CUresult Res = cuFuncGetAttribute(
-        &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
+    Res = cuFuncGetAttribute(&MaxThreads,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
     if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
       return Err;
 
-    /// Set the maximum number of threads for the CUDA kernel.
+    // The maximum number of threads cannot exceed the maximum of the kernel.
     MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
 
     return Plugin::success();
   }
 
-  /// Launch the CUDA kernel function
+  /// Launch the CUDA kernel function.
   Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                    uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
@@ -165,44 +214,6 @@ struct CUDAEventRef final : public GenericDeviceResourceRef {
   HandleTy Event;
 };
 
-/// Class implementing the CUDA device images properties.
-struct CUDADeviceImageTy : public DeviceImageTy {
-  /// Create the CUDA image with the id and the target image pointer.
-  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
-
-  /// Load the image as a CUDA module.
-  Error loadModule() {
-    assert(!Module && "Module already loaded");
-
-    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Unload the CUDA module corresponding to the image.
-  Error unloadModule() {
-    assert(Module && "Module not loaded");
-
-    CUresult Res = cuModuleUnload(Module);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
-      return Err;
-
-    Module = nullptr;
-
-    return Plugin::success();
-  }
-
-  /// Getter of the CUDA module.
-  CUmodule getModule() const { return Module; }
-
-private:
-  /// The CUDA module that loaded the image.
-  CUmodule Module;
-};
-
 /// Class implementing the CUDA device functionalities which derives from the
 /// generic device class.
 struct CUDADeviceTy : public GenericDeviceTy {
@@ -330,32 +341,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
-    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
-
-    // Retrieve the function pointer of the kernel.
-    CUfunction Func;
-    CUresult Res =
-        cuModuleGetFunction(&Func, CUDAImage.getModule(), KernelEntry.name);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
-                                 KernelEntry.name))
-      return std::move(Err);
-
-    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry),
-       KernelEntry.name, DPxPTR(Func));
-
-    Expected<OMPTgtExecModeFlags> ExecModeOrErr =
-        getExecutionModeForKernel(KernelEntry.name, Image);
-    if (!ExecModeOrErr)
-      return ExecModeOrErr.takeError();
-
-    // Allocate and initialize the CUDA kernel.
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
-    new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecModeOrErr.get(), Func);
+    if (!CUDAKernel)
+      return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+    new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode);
 
-    return CUDAKernel;
+    return *CUDAKernel;
   }
 
   /// Set the current context to this device's context.

diff  --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 6652409dc8071a..e48d1bbd65e907 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -49,14 +49,27 @@ using llvm::sys::DynamicLibrary;
 
 /// Class implementing kernel functionalities for GenELF64.
 struct GenELF64KernelTy : public GenericKernelTy {
-  /// Construct the kernel with a name, execution mode and a function.
-  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
-                   void (*Func)(void))
-      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+  /// Construct the kernel with a name and an execution mode.
+  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
+      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
 
   /// Initialize the kernel.
-  Error initImpl(GenericDeviceTy &GenericDevice,
-                 DeviceImageTy &Image) override {
+  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
+    // Functions have zero size.
+    GlobalTy Global(getName(), 0);
+
+    // Get the metadata (address) of the kernel function.
+    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+    if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, Global))
+      return Err;
+
+    // Check that the function pointer is valid.
+    if (!Global.getPtr())
+      return Plugin::error("Invalid function for kernel %s", getName());
+
+    // Save the function pointer.
+    Func = (void (*)())Global.getPtr();
+
     // Set the maximum number of threads to a single.
     MaxNumThreads = 1;
     return Plugin::success();
@@ -119,23 +132,18 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error deinitImpl() override { return Plugin::success(); }
 
   /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
-    GlobalTy Func(KernelEntry);
-
-    // Get the metadata (address) of the kernel function.
-    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
-    if (auto Err = GHandler.getGlobalMetadataFromDevice(*this, Image, Func))
-      return std::move(Err);
-
-    // Allocate and create the kernel.
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel =
         Plugin::get().allocate<GenELF64KernelTy>();
-    new (GenELF64Kernel) GenELF64KernelTy(
-        KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)())Func.getPtr());
+    if (!GenELF64Kernel)
+      return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    return GenELF64Kernel;
+    new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode);
+
+    return *GenELF64Kernel;
   }
 
   /// Set the current context to this device, which is a no-op.
@@ -312,6 +320,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
   Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
+protected:
+  /// Retrieve the execution mode for kernels. All kernels use the generic mode.
+  Expected<OMPTgtExecModeFlags>
+  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override {
+    return OMP_TGT_EXEC_MODE_GENERIC;
+  }
+
 private:
   /// Grid values for Generic ELF64 plugins.
   static constexpr GV GenELF64GridValues = {


        


More information about the Openmp-commits mailing list