[Openmp-commits] [openmp] 621bafd - [Libomptarget] Move target table handling out of the plugins (#77150)

via Openmp-commits openmp-commits at lists.llvm.org
Mon Jan 22 09:06:51 PST 2024


Author: Joseph Huber
Date: 2024-01-22T11:06:47-06:00
New Revision: 621bafd5c14cc324612e32c8123ac1ebf1c0530b

URL: https://github.com/llvm/llvm-project/commit/621bafd5c14cc324612e32c8123ac1ebf1c0530b
DIFF: https://github.com/llvm/llvm-project/commit/621bafd5c14cc324612e32c8123ac1ebf1c0530b.diff

LOG: [Libomptarget] Move target table handling out of the plugins (#77150)

Summary:
This patch removes the bulk of the handling of the
`__tgt_offload_entries` out of the plugins themselves. The reason for this
is that the plugins should not be handling this
implementation detail of the OpenMP runtime. Instead, we expose two new
plugin API functions to look up a device pointer for a global
as well as a handle for a kernel.

This required introducing a new type to represent a binary image that
has been loaded on a device. We can then use this to load the addresses
as needed. The creation of the mapping table is then handled just in
`libomptarget` where we simply look up each address individually. This
should allow us to expose these operations more generically when we
provide a separate API.

Added: 
    

Modified: 
    openmp/libomptarget/include/Shared/APITypes.h
    openmp/libomptarget/include/Shared/PluginAPI.h
    openmp/libomptarget/include/Shared/PluginAPI.inc
    openmp/libomptarget/include/device.h
    openmp/libomptarget/include/rtl.h
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
    openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
    openmp/libomptarget/src/PluginManager.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/omptarget.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/Shared/APITypes.h b/openmp/libomptarget/include/Shared/APITypes.h
index 763a22f0a5e8636..94521b4fbb57790 100644
--- a/openmp/libomptarget/include/Shared/APITypes.h
+++ b/openmp/libomptarget/include/Shared/APITypes.h
@@ -62,6 +62,11 @@ struct __tgt_target_table {
       *EntriesEnd; // End of the table with all the entries (non inclusive)
 };
 
+/// This struct contains a handle to a loaded binary in the plugin device.
+struct __tgt_device_binary {
+  uintptr_t handle;
+};
+
 // clang-format on
 
 /// This struct contains information exchanged between different asynchronous

diff  --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index aece53d7ee1caab..5de5f106045b555 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -57,8 +57,18 @@ int32_t __tgt_rtl_init_device(int32_t ID);
 // return NULL. Otherwise, return a pointer to the built address table.
 // Individual entries in the table may also be NULL, when the corresponding
 // offload region is not supported on the target device.
-__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
-                                          __tgt_device_image *Image);
+int32_t __tgt_rtl_load_binary(int32_t ID, __tgt_device_image *Image,
+                              __tgt_device_binary *Binary);
+
+// Look up the device address of the named symbol in the given binary. Returns
+// non-zero on failure.
+int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
+                             const char *Name, void **DevicePtr);
+
+// Look up the device address of the named kernel in the given binary. Returns
+// non-zero on failure.
+int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
+                               void **DevicePtr);
 
 // Allocate data on the particular target device, of the specified size.
 // HostPtr is a address of the host data the allocated target data

diff  --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index b842c6eef1d4fc7..5f8a9dd11fdcefb 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -19,6 +19,8 @@ PLUGIN_API_HANDLE(is_data_exchangable, false);
 PLUGIN_API_HANDLE(number_of_devices, true);
 PLUGIN_API_HANDLE(init_device, true);
 PLUGIN_API_HANDLE(load_binary, true);
+PLUGIN_API_HANDLE(get_global, true);
+PLUGIN_API_HANDLE(get_function, true);
 PLUGIN_API_HANDLE(data_alloc, true);
 PLUGIN_API_HANDLE(data_submit, true);
 PLUGIN_API_HANDLE(data_submit_async, false);

diff  --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 3023fba6cc64db5..1dc82e36f6813d5 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -70,7 +70,7 @@ struct DeviceTy {
   /// Provide access to the mapping handler.
   MappingInfoTy &getMappingInfo() { return MappingInfo; }
 
-  __tgt_target_table *loadBinary(__tgt_device_image *Img);
+  llvm::Expected<__tgt_device_binary> loadBinary(__tgt_device_image *Img);
 
   // device memory allocation/deallocation routines
   /// Allocates \p Size bytes on the device, host or shared memory space

diff  --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index d110e89de5f14ea..5e198bdad43642f 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -26,11 +26,16 @@
 /// are trying to (re)register an existing lib or really have a new one.
 struct TranslationTable {
   __tgt_target_table HostTable;
+  llvm::SmallVector<__tgt_target_table> DeviceTables;
 
   // Image assigned to a given device.
   llvm::SmallVector<__tgt_device_image *>
       TargetsImages; // One image per device ID.
 
+  // Arrays of entries active on the device.
+  llvm::SmallVector<llvm::SmallVector<__tgt_offload_entry>>
+      TargetsEntries; // One table per device ID.
+
   // Table of entry points or NULL if it was not already computed.
   llvm::SmallVector<__tgt_target_table *>
       TargetsTable; // One table per device ID.

diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 8066a231ef93f2c..81634ae1edc4908 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -439,8 +439,9 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 /// Class implementing the AMDGPU device images' properties.
 struct AMDGPUDeviceImageTy : public DeviceImageTy {
   /// Create the AMDGPU image with the id and the target image pointer.
-  AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage) {}
+  AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
+                      const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, Device, TgtImage) {}
 
   /// Prepare and load the executable corresponding to the image.
   Error loadExecutable(const AMDGPUDeviceTy &Device);
@@ -2105,14 +2106,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) override {
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the AMDGPU kernel.
     AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
+    new (AMDGPUKernel) AMDGPUKernelTy(Name);
 
     return *AMDGPUKernel;
   }
@@ -2160,7 +2160,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Allocate and initialize the image object.
     AMDGPUDeviceImageTy *AMDImage =
         Plugin::get().allocate<AMDGPUDeviceImageTy>();
-    new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
+    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
 
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))

diff  --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index 8707e7b4c504e33..5c767995126b771 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -47,9 +47,6 @@ class GlobalTy {
   GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
       : Name(Name), Size(Size), Ptr(Ptr) {}
 
-  GlobalTy(const __tgt_offload_entry &Entry)
-      : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {}
-
   const std::string &getName() const { return Name; }
   uint32_t getSize() const { return Size; }
   void *getPtr() const { return Ptr; }

diff  --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index d55dfbdd9e4c1ed..3c2a4d7e6c0e706 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -182,34 +182,6 @@ class InfoQueueTy {
 /// specific device. This class is responsible for storing and managing
 /// the offload entries for an image on a device.
 class DeviceImageTy {
-
-  /// Class representing the offload entry table. The class stores the
-  /// __tgt_target_table and a map to search in the table faster.
-  struct OffloadEntryTableTy {
-    /// Add new entry to the table.
-    void addEntry(const __tgt_offload_entry &Entry) {
-      Entries.push_back(Entry);
-      TTTablePtr.EntriesBegin = &Entries[0];
-      TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size();
-    }
-
-    /// Get the raw pointer to the __tgt_target_table.
-    operator __tgt_target_table *() {
-      if (Entries.empty())
-        return nullptr;
-      return &TTTablePtr;
-    }
-
-  private:
-    __tgt_target_table TTTablePtr;
-    llvm::SmallVector<__tgt_offload_entry> Entries;
-
-  public:
-    using const_iterator = decltype(Entries)::const_iterator;
-    const_iterator begin() const { return Entries.begin(); }
-    const_iterator end() const { return Entries.end(); }
-  };
-
   /// Image identifier within the corresponding device. Notice that this id is
   /// not unique between different device; they may overlap.
   int32_t ImageId;
@@ -218,18 +190,19 @@ class DeviceImageTy {
   const __tgt_device_image *TgtImage;
   const __tgt_device_image *TgtImageBitcode;
 
+  /// Reference to the device this image is loaded on.
+  GenericDeviceTy &Device;
+
   /// If this image has any global destructors that much be called.
   /// FIXME: This is only required because we currently have no invariants
   ///        towards the lifetime of the underlying image. We should either copy
   ///        the image into memory locally or erase the pointers after init.
   bool PendingGlobalDtors;
 
-  /// Table of offload entries.
-  OffloadEntryTableTy OffloadEntryTable;
-
 public:
-  DeviceImageTy(int32_t Id, const __tgt_device_image *Image)
-      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr),
+  DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
+                const __tgt_device_image *Image)
+      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
         PendingGlobalDtors(false) {
     assert(TgtImage && "Invalid target image");
   }
@@ -237,6 +210,9 @@ class DeviceImageTy {
   /// Get the image identifier within the device.
   int32_t getId() const { return ImageId; }
 
+  /// Get the device that this image is loaded onto.
+  GenericDeviceTy &getDevice() const { return Device; }
+
   /// Get the pointer to the raw __tgt_device_image.
   const __tgt_device_image *getTgtImage() const { return TgtImage; }
 
@@ -261,13 +237,9 @@ class DeviceImageTy {
     return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
                            "Image");
   }
-
   /// Accessors to the boolean value
   bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
   bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
-
-  /// Get a reference to the offload entry table for the image.
-  OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; }
 };
 
 /// Class implementing common functionalities of offload kernels. Each plugin
@@ -661,8 +633,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error deinitImpl() = 0;
 
   /// Load the binary image into the device and return the target table.
-  Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin,
-                                            const __tgt_device_image *TgtImage);
+  Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
+                                       const __tgt_device_image *TgtImage);
   virtual Expected<DeviceImageTy *>
   loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
 
@@ -680,9 +652,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   // up to the target to override this using the shouldSetupRPCServer function.
   Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
-  /// Register the offload entries for a specific image on the device.
-  Error registerOffloadEntries(DeviceImageTy &Image);
-
   /// Synchronize the current thread with the pending operations on the
   /// __tgt_async_info structure.
   Error synchronize(__tgt_async_info *AsyncInfo);
@@ -888,21 +857,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
-private:
-  /// Register offload entry for global variable.
-  Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
-                                   const __tgt_offload_entry &GlobalEntry,
-                                   __tgt_offload_entry &DeviceEntry);
-
-  /// Register offload entry for kernel function.
-  Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage,
-                                   const __tgt_offload_entry &KernelEntry,
-                                   __tgt_offload_entry &DeviceEntry);
-
   /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) = 0;
+  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
+private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.

diff  --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 6ae30e78ce8c2fd..def9c14fa53f866 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -61,6 +61,14 @@ struct RecordReplayTy {
   bool UsedVAMap = false;
   uintptr_t MemoryOffset = 0;
 
+  // A list of all globals mapped to the device.
+  struct GlobalEntry {
+    const char *Name;
+    uint64_t Size;
+    void *Addr;
+  };
+  llvm::SmallVector<GlobalEntry> GlobalEntries{};
+
   void *suggestAddress(uint64_t MaxMemoryAllocation) {
     // Get a valid pointer address for this system
     void *Addr =
@@ -189,6 +197,9 @@ struct RecordReplayTy {
   }
   void setStatus(RRStatusTy Status) { this->Status = Status; }
   bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
+  void addEntry(const char *Name, uint64_t Size, void *Addr) {
+    GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
+  }
 
   void saveImage(const char *Name, const DeviceImageTy &Image) {
     SmallString<128> ImageName = {Name, ".image"};
@@ -211,12 +222,12 @@ struct RecordReplayTy {
   void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
     int32_t Size = 0;
 
-    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
-      if (!OffloadEntry.size)
+    for (auto &OffloadEntry : GlobalEntries) {
+      if (!OffloadEntry.Size)
         continue;
       // Get the total size of the string and entry including the null byte.
-      Size += std::strlen(OffloadEntry.name) + 1 + sizeof(uint32_t) +
-              OffloadEntry.size;
+      Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
+              OffloadEntry.Size;
     }
 
     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
@@ -225,26 +236,26 @@ struct RecordReplayTy {
       report_fatal_error("Error creating MemoryBuffer for globals memory");
 
     void *BufferPtr = GlobalsMB.get()->getBufferStart();
-    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
-      if (!OffloadEntry.size)
+    for (auto &OffloadEntry : GlobalEntries) {
+      if (!OffloadEntry.Size)
         continue;
 
-      int32_t NameLength = std::strlen(OffloadEntry.name) + 1;
-      memcpy(BufferPtr, OffloadEntry.name, NameLength);
+      int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
+      memcpy(BufferPtr, OffloadEntry.Name, NameLength);
       BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
 
-      *((uint32_t *)(BufferPtr)) = OffloadEntry.size;
+      *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
       BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
 
       auto Err = Plugin::success();
       {
-        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr,
-                                            OffloadEntry.size, nullptr))
+        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
+                                            OffloadEntry.Size, nullptr))
           report_fatal_error("Error retrieving data for global");
       }
       if (Err)
         report_fatal_error("Error retrieving data for global");
-      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size);
+      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size);
     }
     assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
            "Buffer over/under-filled.");
@@ -841,7 +852,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   return deinitImpl();
 }
-Expected<__tgt_target_table *>
+Expected<DeviceImageTy *>
 GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                             const __tgt_device_image *InputTgtImage) {
   assert(InputTgtImage && "Expected non-null target image");
@@ -885,10 +896,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
       return std::move(Err);
   }
 
-  // Register all offload entries of the image.
-  if (auto Err = registerOffloadEntries(*Image))
-    return std::move(Err);
-
   if (auto Err = setupRPCServer(Plugin, *Image))
     return std::move(Err);
 
@@ -909,7 +916,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
     return std::move(Err);
 
   // Return the pointer to the table of entries.
-  return Image->getOffloadEntryTable();
+  return Image;
 }
 
 Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
@@ -1018,99 +1025,6 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
   return Plugin::success();
 }
 
-Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
-  const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
-  const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
-  for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) {
-    // The host should have always something in the address to uniquely
-    // identify the entry.
-    if (!Entry->addr)
-      return Plugin::error("Failure to register entry without address");
-
-    __tgt_offload_entry DeviceEntry = {0};
-
-    if (Entry->size) {
-      if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry))
-        return Err;
-    } else {
-      if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry))
-        return Err;
-    }
-
-    assert(DeviceEntry.addr && "Device addr of offload entry cannot be null");
-
-    DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
-       DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name,
-       DPxPTR(DeviceEntry.addr));
-  }
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::registerGlobalOffloadEntry(
-    DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry,
-    __tgt_offload_entry &DeviceEntry) {
-
-  GenericPluginTy &Plugin = Plugin::get();
-
-  DeviceEntry = GlobalEntry;
-
-  // Create a metadata object for the device global.
-  GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size);
-
-  // Get the address of the device of the global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (auto Err =
-          GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal))
-    return Err;
-
-  // Store the device address on the device entry.
-  DeviceEntry.addr = DeviceGlobal.getPtr();
-  assert(DeviceEntry.addr && "Invalid device global's address");
-
-  // Note: In the current implementation declare target variables
-  // can either be link or to. This means that once unified
-  // memory is activated via the requires directive, the variable
-  // can be used directly from the host in both cases.
-  if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
-    // If unified memory is present any target link or to variables
-    // can access host addresses directly. There is no longer a
-    // need for device copies.
-    GlobalTy HostGlobal(GlobalEntry);
-    if (auto Err =
-            GHandler.writeGlobalToDevice(*this, HostGlobal, DeviceGlobal))
-      return Err;
-  }
-
-  // Add the device entry on the entry table.
-  Image.getOffloadEntryTable().addEntry(DeviceEntry);
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::registerKernelOffloadEntry(
-    DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry,
-    __tgt_offload_entry &DeviceEntry) {
-  DeviceEntry = KernelEntry;
-
-  // Create a kernel object.
-  auto KernelOrErr = constructKernel(KernelEntry);
-  if (!KernelOrErr)
-    return KernelOrErr.takeError();
-
-  GenericKernelTy &Kernel = *KernelOrErr;
-
-  // Initialize the kernel.
-  if (auto Err = Kernel.init(*this, Image))
-    return Err;
-
-  // Set the device entry address to the kernel address and store the entry on
-  // the entry table.
-  DeviceEntry.addr = (void *)&Kernel;
-  Image.getOffloadEntryTable().addEntry(DeviceEntry);
-
-  return Plugin::success();
-}
-
 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                          size_t Size, bool ExternallyLocked) {
   // Insert the new entry into the map.
@@ -1757,23 +1671,25 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
   return OFFLOAD_SUCCESS;
 }
 
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *TgtImage) {
+int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
+                              __tgt_device_binary *Binary) {
   GenericPluginTy &Plugin = Plugin::get();
   GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
 
-  auto TableOrErr = Device.loadBinary(Plugin, TgtImage);
-  if (!TableOrErr) {
-    auto Err = TableOrErr.takeError();
+  auto ImageOrErr = Device.loadBinary(Plugin, TgtImage);
+  if (!ImageOrErr) {
+    auto Err = ImageOrErr.takeError();
     REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
            DeviceId, toString(std::move(Err)).data());
-    return nullptr;
+    return OFFLOAD_FAIL;
   }
 
-  __tgt_target_table *Table = *TableOrErr;
-  assert(Table != nullptr && "Invalid table");
+  DeviceImageTy *Image = *ImageOrErr;
+  assert(Image != nullptr && "Invalid Image");
+
+  *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};
 
-  return Table;
+  return OFFLOAD_SUCCESS;
 }
 
 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
@@ -2077,6 +1993,58 @@ int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
     return false;
   return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
 }
+
+int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
+                             const char *Name, void **DevicePtr) {
+  assert(Binary.handle && "Invalid device binary handle");
+  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
+
+  GenericPluginTy &Plugin = Plugin::get();
+  GenericDeviceTy &Device = Image.getDevice();
+
+  GlobalTy DeviceGlobal(Name, Size);
+  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+  if (auto Err =
+          GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
+    REPORT("Failure to look up global address: %s\n",
+           toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  *DevicePtr = DeviceGlobal.getPtr();
+  assert(DevicePtr && "Invalid device global's address");
+
+  // Save the loaded globals if we are recording.
+  if (RecordReplay.isRecording())
+    RecordReplay.addEntry(Name, Size, *DevicePtr);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
+                               void **KernelPtr) {
+  assert(Binary.handle && "Invalid device binary handle");
+  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
+
+  GenericDeviceTy &Device = Image.getDevice();
+
+  auto KernelOrErr = Device.constructKernel(Name);
+  if (Error Err = KernelOrErr.takeError()) {
+    REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  GenericKernelTy &Kernel = *KernelOrErr;
+  if (auto Err = Kernel.init(Device, Image)) {
+    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  // Note that this is not the kernel's device address.
+  *KernelPtr = &Kernel;
+  return OFFLOAD_SUCCESS;
+}
+
 #ifdef __cplusplus
 }
 #endif

diff  --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index ce6b39898ae95a2..5ed73d103584d96 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -77,8 +77,9 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
 /// Class implementing the CUDA device images properties.
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
-  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
+  CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
+                    const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
 
   /// Load the image as a CUDA module.
   Error loadModule() {
@@ -468,14 +469,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) override {
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
     if (!CUDAKernel)
       return Plugin::error("Failed to allocate memory for CUDA kernel");
 
-    new (CUDAKernel) CUDAKernelTy(KernelEntry.name);
+    new (CUDAKernel) CUDAKernelTy(Name);
 
     return *CUDAKernel;
   }
@@ -530,7 +530,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     // Allocate and initialize the image object.
     CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>();
-    new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage);
+    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
 
     // Load the CUDA module.
     if (auto Err = CUDAImage->loadModule())

diff  --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 6466afc543b5684..38fc275804faf12 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -111,8 +111,9 @@ struct GenELF64KernelTy : public GenericKernelTy {
 /// Class implementing the GenELF64 device images properties.
 struct GenELF64DeviceImageTy : public DeviceImageTy {
   /// Create the GenELF64 image with the id and the target image pointer.
-  GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage), DynLib() {}
+  GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
+                        const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {}
 
   /// Getter and setter for the dynamic library.
   DynamicLibrary &getDynamicLibrary() { return DynLib; }
@@ -141,15 +142,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   std::string getComputeUnitKind() const override { return "generic-64bit"; }
 
   /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) override {
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel =
         Plugin::get().allocate<GenELF64KernelTy>();
     if (!GenELF64Kernel)
       return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name);
+    new (GenELF64Kernel) GenELF64KernelTy(Name);
 
     return *GenELF64Kernel;
   }
@@ -163,7 +163,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     // Allocate and initialize the image object.
     GenELF64DeviceImageTy *Image =
         Plugin::get().allocate<GenELF64DeviceImageTy>();
-    new (Image) GenELF64DeviceImageTy(ImageId, TgtImage);
+    new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage);
 
     // Create a temporary file.
     char TmpFileName[] = "/tmp/tmpfile_XXXXXX";

diff  --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 50059ba23b1a77f..f65ffc47d89a165 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -192,7 +192,9 @@ static void registerImageIntoTranslationTable(TranslationTable &TT,
       RTL.DeviceOffset + RTL.getNumberOfUserDevices();
 
   if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
+    TT.DeviceTables.resize(TargetsTableMinimumSize, {});
     TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
+    TT.TargetsEntries.resize(TargetsTableMinimumSize, {});
     TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
   }
 

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 404d7b6174e4aab..9bdc6b7cd8c9afd 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -107,8 +107,14 @@ llvm::Error DeviceTy::init() {
 }
 
 // Load binary to device.
-__tgt_target_table *DeviceTy::loadBinary(__tgt_device_image *Img) {
-  return RTL->load_binary(RTLDeviceID, Img);
+llvm::Expected<__tgt_device_binary>
+DeviceTy::loadBinary(__tgt_device_image *Img) {
+  __tgt_device_binary Binary;
+
+  if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "Failed to load binary %p", Img);
+  return Binary;
 }
 
 void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index eb2ecfc2bc56bc3..04490ab076b65e7 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -168,19 +168,57 @@ static int initLibrary(DeviceTy &Device) {
         Rc = OFFLOAD_FAIL;
         break;
       }
-      // 2) load image into the target table.
-      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
-          Device.loadBinary(Img);
-      // Unable to get table for this image: invalidate image and fail.
-      if (!TargetTable) {
-        REPORT("Unable to generate entries table for device id %d.\n",
-               DeviceId);
-        TransTable->TargetsImages[DeviceId] = 0;
+
+      // 2) Load the image onto the given device.
+      auto BinaryOrErr = Device.loadBinary(Img);
+      if (llvm::Error Err = BinaryOrErr.takeError()) {
+        REPORT("Failed to load image %s\n",
+               llvm::toString(std::move(Err)).c_str());
         Rc = OFFLOAD_FAIL;
         break;
       }
 
-      // Verify whether the two table sizes match.
+      // 3) Create the translation table.
+      llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
+          TransTable->TargetsEntries[DeviceId];
+      for (__tgt_offload_entry &Entry :
+           llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
+        __tgt_device_binary &Binary = *BinaryOrErr;
+
+        __tgt_offload_entry DeviceEntry = Entry;
+        if (Entry.size) {
+          if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
+                                     &DeviceEntry.addr) != OFFLOAD_SUCCESS)
+            REPORT("Failed to load symbol %s\n", Entry.name);
+
+          // If unified memory is active, the corresponding global is a device
+          // reference to the host global. We need to initialize the pointer on
+          // the deive to point to the memory on the host.
+          if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+            if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
+                                        Entry.size) != OFFLOAD_SUCCESS)
+              REPORT("Failed to write symbol for USM %s\n", Entry.name);
+          }
+        } else {
+          if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) !=
+              OFFLOAD_SUCCESS)
+            REPORT("Failed to load kernel %s\n", Entry.name);
+        }
+        DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
+           DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name,
+           DPxPTR(DeviceEntry.addr));
+
+        DeviceEntries.emplace_back(DeviceEntry);
+      }
+
+      // Set the storage for the table and get a pointer to it.
+      __tgt_target_table DeviceTable{&DeviceEntries[0],
+                                     &DeviceEntries[0] + DeviceEntries.size()};
+      TransTable->DeviceTables[DeviceId] = DeviceTable;
+      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
+          &TransTable->DeviceTables[DeviceId];
+
+      // 4) Verify whether the two table sizes match.
       size_t Hsize =
           TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
       size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;


        


More information about the Openmp-commits mailing list