[llvm-branch-commits] [openmp] 207f96e - [Libomptarget] Deinitialize AMDGPU global state more intentionally

Tom Stellard via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Aug 8 11:01:53 PDT 2022


Author: Joseph Huber
Date: 2022-08-08T11:00:41-07:00
New Revision: 207f96e8fac08db23bd2a8edc700e162f135ef62

URL: https://github.com/llvm/llvm-project/commit/207f96e8fac08db23bd2a8edc700e162f135ef62
DIFF: https://github.com/llvm/llvm-project/commit/207f96e8fac08db23bd2a8edc700e162f135ef62.diff

LOG: [Libomptarget] Deinitialize AMDGPU global state more intentionally

A previous patch made the destruction of the HSA plugin more
deterministic. However, there were still other global values that are not
handled this way. When attempting to call a destructor kernel, the
device would have already been uninitialized and we could not find the
appropriate kernel to call. This is because they were stored in global
containers that had their destructors called already. Merges this global
state into the rest of the info state by putting those global values
inside of the global pointer already allocated and deallocated by the
constructor and destructor. This should allow the AMDGPU plugin to
correctly identify the destructors if we were to run them.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D131011

(cherry picked from commit 2b7203a35972e98b8521f92d2791043dc539ae88)

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 3eb1d0ffa6304..6b95756f1d302 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -210,9 +210,6 @@ struct KernelArgPool {
 };
 pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
 
-std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
-    KernelArgPoolMap;
-
 /// Use a single entity to encode a kernel and a set of flags
 struct KernelTy {
   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
@@ -224,7 +221,9 @@ struct KernelTy {
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool,
+           std::unordered_map<std::string, std::unique_ptr<KernelArgPool>>
+               &KernelArgPoolMap)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -238,10 +237,6 @@ struct KernelTy {
   }
 };
 
-/// List that contains all the kernels.
-/// FIXME: we may need this to be per device and per library.
-std::list<KernelTy> KernelsList;
-
 template <typename Callback> static hsa_status_t findAgents(Callback CB) {
 
   hsa_status_t Err =
@@ -456,6 +451,12 @@ class RTLDeviceInfoTy : HSALifetime {
 
   int NumberOfDevices = 0;
 
+  /// List that contains all the kernels.
+  /// FIXME: we may need this to be per device and per library.
+  std::list<KernelTy> KernelsList;
+  std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
+      KernelArgPoolMap;
+
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
   std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
@@ -857,7 +858,6 @@ class RTLDeviceInfoTy : HSALifetime {
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1116,19 +1116,6 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
 static RTLDeviceInfoTy *DeviceInfoState = nullptr;
 static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }
 
-int32_t __tgt_rtl_init_plugin() {
-  DeviceInfoState = new RTLDeviceInfoTy;
-  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
-             ? OFFLOAD_SUCCESS
-             : OFFLOAD_FAIL;
-}
-
-int32_t __tgt_rtl_deinit_plugin() {
-  if (DeviceInfoState)
-    delete DeviceInfoState;
-  return OFFLOAD_SUCCESS;
-}
-
 namespace {
 
 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
@@ -1171,7 +1158,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
   Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
-                                           DeviceId);
+                                             DeviceId);
   if (Err != HSA_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1468,8 +1455,9 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
     KernelArgPool *ArgPool = nullptr;
     void *KernArg = nullptr;
     {
-      auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
-      if (It != KernelArgPoolMap.end()) {
+      auto It =
+          DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name));
+      if (It != DeviceInfo().KernelArgPoolMap.end()) {
         ArgPool = (It->second).get();
       }
     }
@@ -2031,6 +2019,20 @@ bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
 }
 
 extern "C" {
+
+int32_t __tgt_rtl_init_plugin() {
+  DeviceInfoState = new RTLDeviceInfoTy;
+  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
+             ? OFFLOAD_SUCCESS
+             : OFFLOAD_FAIL;
+}
+
+int32_t __tgt_rtl_deinit_plugin() {
+  if (DeviceInfoState)
+    delete DeviceInfoState;
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
   return elfMachineIdIsAmdgcn(Image);
 }
@@ -2373,8 +2375,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
         }
 
         // write ptr to device memory so it can be used by later kernels
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
-                                                 DeviceId);
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
+                                                   sizeof(void *), DeviceId);
         if (Err != HSA_STATUS_SUCCESS) {
           DP("memcpy install of state_ptr failed\n");
           return NULL;
@@ -2437,8 +2439,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
         // If unified memory is present any target link variables
         // can access host addresses directly. There is no longer a
         // need for device copies.
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
-                                                 sizeof(void *), DeviceId);
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
+                                                 DeviceId);
         if (Err != HSA_STATUS_SUCCESS)
           DP("Error when copying USM\n");
         DP("Copy linked variable host address (" DPxMOD ")"
@@ -2598,11 +2600,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
     }
     check("Loading computation property", Err);
 
-    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
-                                   CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
+    DeviceInfo().KernelsList.push_back(
+        KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name,
+                 KernargSegmentSize, DeviceInfo().KernArgPool,
+                 DeviceInfo().KernelArgPoolMap));
     __tgt_offload_entry Entry = *E;
-    Entry.addr = (void *)&KernelsList.back();
+    Entry.addr = (void *)&DeviceInfo().KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);
     DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
   }


        


More information about the llvm-branch-commits mailing list