[Openmp-commits] [openmp] [llvm] [clang] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)

Fri Nov 10 09:32:00 PST 2023

================
@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {
   using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
   using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
 
+  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+                                 bool IsCtor) {
+    // Perform a quick check for the named kernel in the image. The kernel
+    // should be created by the 'nvptx-lower-ctor-dtor' pass.
+    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+    GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+                    sizeof(void *));
+    if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+      consumeError(std::move(Err));
+      return Plugin::success();
+    }
+
+    // The Nvidia backend cannot handle creating the ctor / dtor array
+    // automatically so we must create it ourselves. The backend will emit
+    // several globals that contain function pointers we can call. These are
+    // prefixed with a known name due to Nvidia's lack of section support.
+    const ELF64LEObjectFile *ELFObj =
+        Handler.getOrCreateELFObjectFile(*this, Image);
+    if (!ELFObj)
+      return Plugin::error("Unable to create ELF object for image %p",
+                           Image.getStart());
+
+    // Search for all symbols that contain a constructor or destructor.
+    SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+    for (ELFSymbolRef Sym : ELFObj->symbols()) {
+      auto NameOrErr = Sym.getName();
+      if (!NameOrErr)
+        return NameOrErr.takeError();
+
+      if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+                                         : "__fini_array_object_"))
+        continue;
+
+      uint16_t priority;
+      if (NameOrErr->rsplit('_').second.getAsInteger(10, priority))
+        return Plugin::error("Invalid priority for constructor or destructor");
+
+      Funcs.emplace_back(*NameOrErr, priority);
+    }
+
+    // Sort the created array to be in priority order.
+    llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
+
+    // Allocate a buffer to store all of the known constructor / destructor
+    // functions in so we can iterate them on the device.
+    void *Buffer =
+        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+    if (!Buffer)
+      return Plugin::error("Failed to allocate memory for global buffer");
+
+    auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
+    auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
+
+    std::size_t Idx = 0;
+    for (auto [Name, Priority] : Funcs) {
+      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &GlobalPtrStart[Idx++]);
+      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+        return std::move(Err);
+    }
+
+    // Copy the created buffer to the appropriate symbols so the kernel can
+    // iterate through them.
+    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+                         sizeof(void *), &GlobalPtrStart);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+      return std::move(Err);
+
+    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+                        sizeof(void *), &GlobalPtrStop);
+    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+      return std::move(Err);
+
+    // Launch the kernel to execute the functions in the buffer.
+    GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
----------------
jdoerfert wrote:

Same as with AMD.

https://github.com/llvm/llvm-project/pull/71739