[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 27 11:47:31 PDT 2024


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/86830

>From 875ed36029851a2423c97b28bd5bf34975efb016 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 27 Mar 2024 11:43:37 -0500
Subject: [PATCH 1/2] [Offload] Change unregister library to use `atexit`
 instead of destructor

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.
---
 clang/test/Driver/linker-wrapper-image.c      |  8 +--
 .../Frontend/Offloading/OffloadWrapper.cpp    | 70 ++++++++++---------
 2 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..d01445e3aed04e 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,11 +26,11 @@
 //      OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
 
 //      OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
 // OPENMP-NEXT: entry:
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
@@ -62,7 +62,7 @@
 // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
 
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
 
 //      CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
 // CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
 // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
 // HIP-NEXT: @.hip.binary_handle = internal global ptr null
 
-// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
 
 //      HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
 // HIP-NEXT: entry:
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index fec1bdbe9d8c74..86e8712ce95ae6 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs,
                             ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
-                            StringRef Suffix) {
+Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
+                                   StringRef Suffix) {
   LLVMContext &C = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-                                ".omp_offloading.descriptor_reg" + Suffix, &M);
+  auto *Func =
+      Function::Create(FuncTy, GlobalValue::InternalLinkage,
+                       ".omp_offloading.descriptor_unreg" + Suffix, &M);
   Func->setSection(".text.startup");
 
-  // Get __tgt_register_lib function declaration.
-  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-                                      /*isVarArg*/ false);
-  FunctionCallee RegFuncC =
-      M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+  // Get __tgt_unregister_lib function declaration.
+  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+                                        /*isVarArg*/ false);
+  FunctionCallee UnRegFuncC =
+      M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(RegFuncC, BinDesc);
+  Builder.CreateCall(UnRegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to constructors.
-  // Set priority to 1 so that __tgt_register_lib is executed AFTER
-  // __tgt_register_requires (we want to know what requirements have been
-  // asked for before we load a libomptarget plugin so that by the time the
-  // plugin is loaded it can report how many devices there are which can
-  // satisfy these requirements).
-  appendToGlobalCtors(M, Func, /*Priority*/ 1);
+  return Func;
 }
 
-void createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
-                              StringRef Suffix) {
+void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
+                            StringRef Suffix) {
   LLVMContext &C = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func =
-      Function::Create(FuncTy, GlobalValue::InternalLinkage,
-                       ".omp_offloading.descriptor_unreg" + Suffix, &M);
+  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+                                ".omp_offloading.descriptor_reg" + Suffix, &M);
   Func->setSection(".text.startup");
 
-  // Get __tgt_unregister_lib function declaration.
-  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-                                        /*isVarArg*/ false);
-  FunctionCallee UnRegFuncC =
-      M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+  // Get __tgt_register_lib function declaration.
+  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+                                      /*isVarArg*/ false);
+  FunctionCallee RegFuncC =
+      M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+  auto *AtExitTy = FunctionType::get(
+      Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+  Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(UnRegFuncC, BinDesc);
+
+  // Register the destructors with 'atexit', This is expected by the CUDA
+  // runtime and ensures that we clean up before dynamic objects are destroyed.
+  // This needs to be done before the runtime is called and registers its own.
+  Builder.CreateCall(AtExit, UnregFunc);
+
+  Builder.CreateCall(RegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to global destructors.
-  // Match priority of __tgt_register_lib
-  appendToGlobalDtors(M, Func, /*Priority*/ 1);
+  // Add this function to constructors.
+  appendToGlobalCtors(M, Func, /*Priority=*/101);
 }
 
 // struct fatbin_wrapper {
@@ -578,7 +583,7 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
   DtorBuilder.CreateRetVoid();
 
   // Add this function to constructors.
-  appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1);
+  appendToGlobalCtors(M, CtorFunc, /*Priority=*/101);
 }
 } // namespace
 
@@ -591,7 +596,6 @@ Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images,
     return createStringError(inconvertibleErrorCode(),
                              "No binary descriptors created.");
   createRegisterFunction(M, Desc, Suffix);
-  createUnregisterFunction(M, Desc, Suffix);
   return Error::success();
 }
 

>From 568fdf842b138db71af4d959142de4d1c7a8e0eb Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 27 Mar 2024 13:47:25 -0500
Subject: [PATCH 2/2] Update llvm/lib/Frontend/Offloading/OffloadWrapper.cpp

---
 llvm/lib/Frontend/Offloading/OffloadWrapper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index 86e8712ce95ae6..7241d15ed1c670 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -232,7 +232,7 @@ void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
 
-  // Register the destructors with 'atexit', This is expected by the CUDA
+  // Register the destructors with 'atexit'. This is expected by the CUDA
   // runtime and ensures that we clean up before dynamic objects are destroyed.
   // This needs to be done before the runtime is called and registers its own.
   Builder.CreateCall(AtExit, UnregFunc);



More information about the cfe-commits mailing list