[Openmp-commits] [llvm] [clang] [openmp] [OpenMP] Rework handling of global ctor/dtors in OpenMP (PR #71739)
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Fri Nov 10 09:32:02 PST 2023
================
@@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy {
using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
+ Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
+ bool IsCtor) {
+ // Perform a quick check for the named kernel in the image. The kernel
+ // should have been created by the 'nvptx-lower-ctor-dtor' pass. If it is
+ // not present, the image has no constructors or destructors to run.
+ GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+ GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini",
+ sizeof(void *));
+ if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
+ consumeError(std::move(Err));
+ return Plugin::success();
+ }
+
+ // The NVPTX backend cannot create the ctor / dtor array automatically, so
+ // we must create it ourselves. Instead, the backend emits several globals
+ // that contain the function pointers we need to call. Because the target
+ // lacks proper section support, these globals are prefixed with a known
+ // name.
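+ // For example, a constructor emitted this way might show up as a symbol
+ // named '__init_array_object_foo_101' (illustrative name), with its
+ // priority encoded in the suffix after the last '_'.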
+ const ELF64LEObjectFile *ELFObj =
+ Handler.getOrCreateELFObjectFile(*this, Image);
+ if (!ELFObj)
+ return Plugin::error("Unable to create ELF object for image %p",
+ Image.getStart());
+
+ // Search the image for every symbol that holds a constructor or destructor
+ // function pointer.
+ SmallVector<std::pair<StringRef, uint16_t>> Funcs;
+ for (ELFSymbolRef Sym : ELFObj->symbols()) {
+ auto NameOrErr = Sym.getName();
+ if (!NameOrErr)
+ return NameOrErr.takeError();
+
+ if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
+ : "__fini_array_object_"))
+ continue;
+
+ uint16_t Priority;
+ if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
+ return Plugin::error("Invalid priority for constructor or destructor");
+
+ Funcs.emplace_back(*NameOrErr, Priority);
+ }
+
+ // Sort the created array to be in priority order.
+ llvm::sort(Funcs, [](const auto &X, const auto &Y) { return X.second < Y.second; });
+
+ // Allocate a buffer to store all of the known constructor / destructor
+ // function pointers so we can iterate over them on the device.
+ void *Buffer =
+ allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED);
+ if (!Buffer)
+ return Plugin::error("Failed to allocate memory for global buffer");
+
+ auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
+ auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
+
+ std::size_t Idx = 0;
+ for (auto [Name, Priority] : Funcs) {
+ GlobalTy FunctionAddr(Name.str(), sizeof(void *), &GlobalPtrStart[Idx++]);
+ if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
+ return std::move(Err);
+ }
+
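+ // At this point the buffer holds the device addresses of all constructor /
+ // destructor functions in priority order.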
+ // Write the buffer's begin and end addresses to the '__init_array_start' /
+ // '__init_array_end' (or '__fini_array_*') symbols so the kernel can
+ // iterate through it.
+ GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
+ sizeof(void *), &GlobalPtrStart);
+ if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
+ return std::move(Err);
+
+ GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
+ sizeof(void *), &GlobalPtrStop);
+ if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
+ return std::move(Err);
+
+ // Launch the kernel to execute the functions in the buffer.
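+ // Conceptually, the kernel walks the published [__init_array_start,
+ // __init_array_end) range and calls each recorded function pointer (a rough
+ // sketch of the intent; the actual loop is generated by the lowering pass).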
+ GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
+ if (!CUDAKernel)
+ return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+ new (CUDAKernel)
+ CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini");
+
+ if (auto Err = CUDAKernel->init(*this, Image))
+ return std::move(Err);
+
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
+
+ if (auto Err = initAsyncInfoImpl(AsyncInfoWrapper))
----------------
jdoerfert wrote:
You shouldn't need this.
https://github.com/llvm/llvm-project/pull/71739