[llvm-branch-commits] [openmp] b5151c3 - [openmp][amdgpu] Move global DeviceInfo behind call syntax prior to using D130712

Tom Stellard via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Aug 8 11:01:48 PDT 2022


Author: Jon Chesterfield
Date: 2022-08-08T11:00:41-07:00
New Revision: b5151c32f9aab3e9c61e622db9ec671dbd2331d9

URL: https://github.com/llvm/llvm-project/commit/b5151c32f9aab3e9c61e622db9ec671dbd2331d9
DIFF: https://github.com/llvm/llvm-project/commit/b5151c32f9aab3e9c61e622db9ec671dbd2331d9.diff

LOG: [openmp][amdgpu] Move global DeviceInfo behind call syntax prior to using D130712

(cherry picked from commit 75aa52106452a1d15ca487af7b408a812012e133)

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 9b4bf41418611..75529863c6aa6 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1113,14 +1113,17 @@ class RTLDeviceInfoTy : HSALifetime {
 
 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static RTLDeviceInfoTy DeviceInfo;
+// Putting accesses to DeviceInfo global behind a function call prior
+// to changing to use init_plugin/deinit_plugin calls
+static RTLDeviceInfoTy DeviceInfoState;
+RTLDeviceInfoTy& DeviceInfo() { return DeviceInfoState; }
 
 namespace {
 
 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                      __tgt_async_info *AsyncInfo) {
   assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   // Return success if we are not copying back to host from target.
   if (!HstPtr)
     return OFFLOAD_SUCCESS;
@@ -1129,7 +1132,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
      (long long unsigned)(Elf64_Addr)TgtPtr,
      (long long unsigned)(Elf64_Addr)HstPtr);
 
-  Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
+  Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
                                            DeviceId);
 
   if (Err != HSA_STATUS_SUCCESS) {
@@ -1148,7 +1151,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                    __tgt_async_info *AsyncInfo) {
   assert(AsyncInfo && "AsyncInfo is nullptr");
   hsa_status_t Err;
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   // Return success if we are not doing host to target.
   if (!HstPtr)
     return OFFLOAD_SUCCESS;
@@ -1156,7 +1159,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
-  Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
+  Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
                                            DeviceId);
   if (Err != HSA_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
@@ -1377,7 +1380,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
 
   std::string KernelName = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
+  auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
   if (KernelInfoTable[DeviceId].find(KernelName) ==
       KernelInfoTable[DeviceId].end()) {
     DP("Kernel %s not found\n", KernelName.c_str());
@@ -1387,7 +1390,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   const atl_kernel_info_t KernelInfoEntry =
       KernelInfoTable[DeviceId][KernelName];
   const uint32_t GroupSegmentSize =
-      KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
+      KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
   const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
   const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
   const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
@@ -1399,12 +1402,12 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
    * Set limit based on ThreadsPerGroup and GroupsPerDevice
    */
   LaunchVals LV =
-      getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env,
+      getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
                     NumTeams,      // From run_region arg
                     ThreadLimit,   // From run_region arg
                     LoopTripcount, // From run_region arg
-                    DeviceInfo.NumTeams[KernelInfo->DeviceId]);
+                    DeviceInfo().NumTeams[KernelInfo->DeviceId]);
   const int GridSize = LV.GridSize;
   const int WorkgroupSize = LV.WorkgroupSize;
 
@@ -1425,7 +1428,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
 
   // Run on the device.
   {
-    hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next();
+    hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
     if (!Queue) {
       return OFFLOAD_FAIL;
     }
@@ -1488,12 +1491,12 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
       ImplArgs->offset_z = 0;
 
       // assign a hostcall buffer for the selected Q
-      if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) {
+      if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
         // hostrpc_assign_buffer is not thread safe, and this function is
         // under a multiple reader lock, not a writer lock.
         static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_lock(&HostcallInitLock);
-        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId],
+        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId],
                                                 Queue, DeviceId);
         pthread_mutex_unlock(&HostcallInitLock);
         if (!Buffer) {
@@ -1527,7 +1530,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
       Packet->kernarg_address = KernArg;
     }
 
-    hsa_signal_t S = DeviceInfo.FreeSignalPool.pop();
+    hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
     if (S.handle == 0) {
       DP("Failed to get signal instance\n");
       return OFFLOAD_FAIL;
@@ -1549,7 +1552,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
 
     assert(ArgPool);
     ArgPool->deallocate(KernArg);
-    DeviceInfo.FreeSignalPool.push(S);
+    DeviceInfo().FreeSignalPool.push(S);
   }
 
   DP("Kernel completed\n");
@@ -1743,7 +1746,7 @@ hsa_status_t moduleRegisterFromMemoryToPlace(
   };
   return core::RegisterModuleFromMemory(
       KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
-      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
+      DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
       HSAExecutables);
 }
 
@@ -1839,7 +1842,7 @@ struct DeviceEnvironment {
         DP("Setting global device environment after load (%u bytes)\n",
            SI.Size);
         int DeviceId = HostDeviceEnv.DeviceNum;
-        auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
+        auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
         void *StatePtr;
         uint32_t StatePtrSize;
         hsa_status_t Err = interop_hsa_get_symbol_info(
@@ -1855,7 +1858,7 @@ struct DeviceEnvironment {
           return HSA_STATUS_ERROR;
         }
 
-        return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
+        return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
                                                   StatePtrSize, DeviceId);
       }
     }
@@ -1866,7 +1869,7 @@ struct DeviceEnvironment {
 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
   uint64_t Rounded = 4 * ((Size + 3) / 4);
   void *Ptr;
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
+  hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
   if (Err != HSA_STATUS_SUCCESS) {
     return Err;
@@ -1893,8 +1896,8 @@ bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
 
 namespace core {
 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
-  return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
-                                     &DeviceInfo.HSAAgents[0], NULL, Ptr);
+  return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
+                                     &DeviceInfo().HSAAgents[0], NULL, Ptr);
 }
 } // namespace core
 
@@ -1916,7 +1919,7 @@ static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
 
   auto TripleTargetID = llvm::StringRef(TargetID);
   if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
-    DeviceInfo.TargetID.push_back(TripleTargetID.ltrim('-').str());
+    DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
   }
   return HSA_STATUS_SUCCESS;
 }
@@ -2034,13 +2037,13 @@ int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
 
   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
     __tgt_rtl_init_device(DeviceId);
-    hsa_agent_t agent = DeviceInfo.HSAAgents[DeviceId];
+    hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
     hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
     if (err != HSA_STATUS_SUCCESS) {
       DP("Error iterating ISAs\n");
       return false;
     }
-    if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo.TargetID[DeviceId]))
+    if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
       return false;
   }
   DP("Image has Target ID compatible with the current environment: %s\n",
@@ -2053,8 +2056,8 @@ int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
 
 int __tgt_rtl_number_of_devices() {
   // If the construction failed, no methods are safe to call
-  if (DeviceInfo.ConstructionSucceeded) {
-    return DeviceInfo.NumberOfDevices;
+  if (DeviceInfo().ConstructionSucceeded) {
+    return DeviceInfo().NumberOfDevices;
   }
   DP("AMDGPU plugin construction failed. Zero devices available\n");
   return 0;
@@ -2062,7 +2065,7 @@ int __tgt_rtl_number_of_devices() {
 
 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
   DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo.RequiresFlags = RequiresFlags;
+  DeviceInfo().RequiresFlags = RequiresFlags;
   return RequiresFlags;
 }
 
@@ -2075,7 +2078,7 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
   // this is per device id init
   DP("Initialize the device id: %d\n", DeviceId);
 
-  hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];
+  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
 
   // Get number of Compute Unit
   uint32_t ComputeUnits = 0;
@@ -2083,39 +2086,39 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
       &ComputeUnits);
   if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.ComputeUnits[DeviceId] = 1;
+    DeviceInfo().ComputeUnits[DeviceId] = 1;
     DP("Error getting compute units : settiing to 1\n");
   } else {
-    DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
-    DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
+    DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
+    DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
   }
 
   char GetInfoName[64]; // 64 max size returned by get info
   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                            (void *)GetInfoName);
   if (Err)
-    DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
+    DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
   else {
-    DeviceInfo.GPUName[DeviceId] = GetInfoName;
+    DeviceInfo().GPUName[DeviceId] = GetInfoName;
   }
 
   if (print_kernel_trace & STARTUP_DETAILS)
     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
-       DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());
+       DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
 
   // Query attributes to determine number of threads/block and blocks/grid.
   uint16_t WorkgroupMaxDim[3];
   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                            &WorkgroupMaxDim);
   if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
+    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
     DP("Error getting grid dims: num groups : %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
-    DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
-    DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]);
+    DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
+    DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
   } else {
-    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
+    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
        "at the hard limit\n",
        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
@@ -2125,22 +2128,22 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
   hsa_dim3_t GridMaxDim;
   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
   if (Err == HSA_STATUS_SUCCESS) {
-    DeviceInfo.ThreadsPerGroup[DeviceId] =
+    DeviceInfo().ThreadsPerGroup[DeviceId] =
         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
-        DeviceInfo.GroupsPerDevice[DeviceId];
+        DeviceInfo().GroupsPerDevice[DeviceId];
 
-    if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) {
-      DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
+    if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
+      DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
-    } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId],
+    } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
                                  RTLDeviceInfoTy::MaxWgSize)) {
       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
     } else {
       DP("Using ROCm Queried thread limit: %d\n",
-         DeviceInfo.ThreadsPerGroup[DeviceId]);
+         DeviceInfo().ThreadsPerGroup[DeviceId]);
     }
   } else {
-    DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
+    DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
     DP("Error getting max block dimension, use default:%d \n",
        RTLDeviceInfoTy::MaxWgSize);
   }
@@ -2151,27 +2154,27 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
       hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
   if (Err == HSA_STATUS_SUCCESS) {
     DP("Queried wavefront size: %d\n", WavefrontSize);
-    DeviceInfo.WarpSize[DeviceId] = WavefrontSize;
+    DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
   } else {
     // TODO: Burn the wavefront size into the code object
     DP("Warning: Unknown wavefront size, assuming 64\n");
-    DeviceInfo.WarpSize[DeviceId] = 64;
+    DeviceInfo().WarpSize[DeviceId] = 64;
   }
 
   // Adjust teams to the env variables
 
-  if (DeviceInfo.Env.TeamLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId],
-                         DeviceInfo.Env.TeamLimit))) {
+  if (DeviceInfo().Env.TeamLimit > 0 &&
+      (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
+                         DeviceInfo().Env.TeamLimit))) {
     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
-       DeviceInfo.Env.TeamLimit);
+       DeviceInfo().Env.TeamLimit);
   }
 
   // Set default number of teams
-  if (DeviceInfo.Env.NumTeams > 0) {
-    DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams;
+  if (DeviceInfo().Env.NumTeams > 0) {
+    DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
     DP("Default number of teams set according to environment %d\n",
-       DeviceInfo.Env.NumTeams);
+       DeviceInfo().Env.NumTeams);
   } else {
     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
     int TeamsPerCU = DefaultTeamsPerCU;
@@ -2179,45 +2182,45 @@ int32_t __tgt_rtl_init_device(int DeviceId) {
       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
     }
 
-    DeviceInfo.NumTeams[DeviceId] =
-        TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId];
+    DeviceInfo().NumTeams[DeviceId] =
+        TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
     DP("Default number of teams = %d * number of compute units %d\n",
-       TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]);
+       TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
   }
 
-  if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId],
-                        DeviceInfo.GroupsPerDevice[DeviceId])) {
+  if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
+                        DeviceInfo().GroupsPerDevice[DeviceId])) {
     DP("Default number of teams exceeds device limit, capping at %d\n",
-       DeviceInfo.GroupsPerDevice[DeviceId]);
+       DeviceInfo().GroupsPerDevice[DeviceId]);
   }
 
   // Adjust threads to the env variables
-  if (DeviceInfo.Env.TeamThreadLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
-                         DeviceInfo.Env.TeamThreadLimit))) {
+  if (DeviceInfo().Env.TeamThreadLimit > 0 &&
+      (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
+                         DeviceInfo().Env.TeamThreadLimit))) {
     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
-       DeviceInfo.Env.TeamThreadLimit);
+       DeviceInfo().Env.TeamThreadLimit);
   }
 
   // Set default number of threads
-  DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
+  DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
   DP("Default number of threads set according to library's default %d\n",
      RTLDeviceInfoTy::DefaultWgSize);
-  if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
-                        DeviceInfo.ThreadsPerGroup[DeviceId])) {
+  if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
+                        DeviceInfo().ThreadsPerGroup[DeviceId])) {
     DP("Default number of threads exceeds device limit, capping at %d\n",
-       DeviceInfo.ThreadsPerGroup[DeviceId]);
+       DeviceInfo().ThreadsPerGroup[DeviceId]);
   }
 
   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
-     DeviceId, DeviceInfo.GroupsPerDevice[DeviceId],
-     DeviceInfo.ThreadsPerGroup[DeviceId]);
+     DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
+     DeviceInfo().ThreadsPerGroup[DeviceId]);
 
   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
-     DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId],
-     DeviceInfo.GroupsPerDevice[DeviceId],
-     DeviceInfo.GroupsPerDevice[DeviceId] *
-         DeviceInfo.ThreadsPerGroup[DeviceId]);
+     DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
+     DeviceInfo().GroupsPerDevice[DeviceId],
+     DeviceInfo().GroupsPerDevice[DeviceId] *
+         DeviceInfo().ThreadsPerGroup[DeviceId]);
 
   return OFFLOAD_SUCCESS;
 }
@@ -2227,9 +2230,9 @@ __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
 
 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                           __tgt_device_image *Image) {
-  DeviceInfo.LoadRunLock.lock();
+  DeviceInfo().LoadRunLock.lock();
   __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
-  DeviceInfo.LoadRunLock.unlock();
+  DeviceInfo().LoadRunLock.unlock();
   return Res;
 }
 
@@ -2259,7 +2262,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
   const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
 
-  DeviceInfo.clearOffloadEntriesTable(DeviceId);
+  DeviceInfo().clearOffloadEntriesTable(DeviceId);
 
   // We do not need to set the ELF version because the caller of this function
   // had to do that to decide the right runtime to use
@@ -2268,25 +2271,25 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
     return NULL;
 
   {
-    auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices,
-                                 DeviceInfo.Env.DynamicMemSize, Image, ImgSize);
+    auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
+                                 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
 
-    auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId];
-    auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
+    auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
+    auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
     hsa_status_t Err = moduleRegisterFromMemoryToPlace(
         KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
         [&](void *Data, size_t Size) {
           if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
-            __atomic_store_n(&DeviceInfo.HostcallRequired, true,
+            __atomic_store_n(&DeviceInfo().HostcallRequired, true,
                              __ATOMIC_RELEASE);
           }
           return Env.beforeLoading(Data, Size);
         },
-        DeviceInfo.HSAExecutables);
+        DeviceInfo().HSAExecutables);
 
     check("Module registering", Err);
     if (Err != HSA_STATUS_SUCCESS) {
-      const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str();
+      const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
       const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
 
       if (strcmp(DeviceName, ElfName) != 0) {
@@ -2315,7 +2318,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     void *StatePtr;
     uint32_t StatePtrSize;
-    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
+    auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
     hsa_status_t Err = interop_hsa_get_symbol_info(
         SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
         &StatePtrSize);
@@ -2340,7 +2343,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
           return NULL;
         }
 
-        auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
+        auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
         if (DSS.first.get() == nullptr) {
           assert(DSS.second == 0);
           void *Ptr = NULL;
@@ -2362,7 +2365,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
         }
 
         // write ptr to device memory so it can be used by later kernels
-        Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
                                                  DeviceId);
         if (Err != HSA_STATUS_SUCCESS) {
           DP("memcpy install of state_ptr failed\n");
@@ -2399,7 +2402,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
       void *Varptr;
       uint32_t Varsize;
 
-      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
+      auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
       hsa_status_t Err = interop_hsa_get_symbol_info(
           SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
 
@@ -2419,14 +2422,14 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
          DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
       Entry.addr = (void *)Varptr;
 
-      DeviceInfo.addOffloadEntry(DeviceId, Entry);
+      DeviceInfo().addOffloadEntry(DeviceId, Entry);
 
-      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+      if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
           E->flags & OMP_DECLARE_TARGET_LINK) {
         // If unified memory is present any target link variables
         // can access host addresses directly. There is no longer a
         // need for device copies.
-        Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr,
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
                                                  sizeof(void *), DeviceId);
         if (Err != HSA_STATUS_SUCCESS)
           DP("Error when copying USM\n");
@@ -2442,7 +2445,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     // errors in kernarg_segment_size previously treated as = 0 (or as undef)
     uint32_t KernargSegmentSize = 0;
-    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId];
+    auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
     hsa_status_t Err = HSA_STATUS_SUCCESS;
     if (!E->name) {
       Err = HSA_STATUS_ERROR;
@@ -2589,19 +2592,19 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo.KernArgPool));
+                                   DeviceInfo().KernArgPool));
     __tgt_offload_entry Entry = *E;
     Entry.addr = (void *)&KernelsList.back();
-    DeviceInfo.addOffloadEntry(DeviceId, Entry);
+    DeviceInfo().addOffloadEntry(DeviceId, Entry);
     DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
   }
 
-  return DeviceInfo.getOffloadEntriesTable(DeviceId);
+  return DeviceInfo().getOffloadEntriesTable(DeviceId);
 }
 
 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
   void *Ptr = NULL;
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
 
   if (Kind != TARGET_ALLOC_DEFAULT) {
     REPORT("Invalid target data allocation kind or requested allocator not "
@@ -2609,7 +2612,7 @@ void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
     return NULL;
   }
 
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
+  hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
      (long long unsigned)(Elf64_Addr)Ptr);
@@ -2619,7 +2622,7 @@ void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
 
 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
                               int64_t Size) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   __tgt_async_info AsyncInfo;
   int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
   if (Rc != OFFLOAD_SUCCESS)
@@ -2630,7 +2633,7 @@ int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
 
 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
                                     int64_t Size, __tgt_async_info *AsyncInfo) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   if (AsyncInfo) {
     initAsyncInfo(AsyncInfo);
     return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
@@ -2640,7 +2643,7 @@ int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
 
 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
                                 int64_t Size) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   __tgt_async_info AsyncInfo;
   int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
   if (Rc != OFFLOAD_SUCCESS)
@@ -2653,13 +2656,13 @@ int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
                                       int64_t Size,
                                       __tgt_async_info *AsyncInfo) {
   assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   initAsyncInfo(AsyncInfo);
   return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
 }
 
 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
-  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
   hsa_status_t Err;
   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
   Err = core::Runtime::Memfree(TgtPtr);
@@ -2676,11 +2679,11 @@ int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                          int32_t ThreadLimit,
                                          uint64_t LoopTripcount) {
 
-  DeviceInfo.LoadRunLock.lock_shared();
+  DeviceInfo().LoadRunLock.lock_shared();
   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
 
-  DeviceInfo.LoadRunLock.unlock_shared();
+  DeviceInfo().LoadRunLock.unlock_shared();
   return Res;
 }
 
@@ -2703,11 +2706,11 @@ int32_t __tgt_rtl_run_target_team_region_async(
   assert(AsyncInfo && "AsyncInfo is nullptr");
   initAsyncInfo(AsyncInfo);
 
-  DeviceInfo.LoadRunLock.lock_shared();
+  DeviceInfo().LoadRunLock.lock_shared();
   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
 
-  DeviceInfo.LoadRunLock.unlock_shared();
+  DeviceInfo().LoadRunLock.unlock_shared();
   return Res;
 }
 
@@ -2740,7 +2743,7 @@ void __tgt_rtl_print_device_info(int32_t DeviceId) {
   // TODO: Assertion to see if DeviceId is correct
   // NOTE: We don't need to set context for print device info.
 
-  DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]);
+  DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
 }
 
 } // extern "C"


        


More information about the llvm-branch-commits mailing list