[Openmp-commits] [openmp] 8218eee - [OpenMP] Refactored the function `target`

Shilei Tian via Openmp-commits openmp-commits at lists.llvm.org
Thu Jul 30 18:06:01 PDT 2020


Author: Shilei Tian
Date: 2020-07-30T21:05:55-04:00
New Revision: 8218eee269c382472b9809cb3cce7a98eed7a31b

URL: https://github.com/llvm/llvm-project/commit/8218eee269c382472b9809cb3cce7a98eed7a31b
DIFF: https://github.com/llvm/llvm-project/commit/8218eee269c382472b9809cb3cce7a98eed7a31b.diff

LOG: [OpenMP] Refactored the function `target`

Refactored the function `target` to make preparation for fixing the
issue of ahead-of-time device memory deallocation.

Reviewed By: ye-luo

Differential Revision: https://reviews.llvm.org/D84816

Added: 
    

Modified: 
    openmp/libomptarget/src/omptarget.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 9c7ab6309503..ece7df6f15ac 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -714,85 +714,82 @@ static bool isLambdaMapping(int64_t Mapping) {
   return (Mapping & LambdaMapping) == LambdaMapping;
 }
 
-/// performs the same actions as data_begin in case arg_num is
-/// non-zero and initiates run of the offloaded region on the target platform;
-/// if arg_num is non-zero after the region execution is done it also
-/// performs the same action as data_update and data_end above. This function
-/// returns 0 if it was able to transfer the execution to a target and an
-/// integer 
diff erent from zero otherwise.
-int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
-           void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers,
-           int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct) {
-  DeviceTy &Device = Devices[DeviceId];
-
-  // Find the table information in the map or look it up in the translation
-  // tables.
-  TableMap *TM = 0;
-  {
-    std::lock_guard<std::mutex> TblMapLock(*TblMapMtx);
-    HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(HostPtr);
-    if (TableMapIt == HostPtrToTableMap->end()) {
-      // We don't have a map. So search all the registered libraries.
-      std::lock_guard<std::mutex> TrlTblLock(*TrlTblMtx);
-      for (HostEntriesBeginToTransTableTy::iterator
-               II = HostEntriesBeginToTransTable->begin(),
-               IE = HostEntriesBeginToTransTable->end();
-           !TM && II != IE; ++II) {
-        // get the translation table (which contains all the good info).
-        TranslationTable *TransTable = &II->second;
-        // iterate over all the host table entries to see if we can locate the
-        // host_ptr.
-        __tgt_offload_entry *Begin = TransTable->HostTable.EntriesBegin;
-        __tgt_offload_entry *End = TransTable->HostTable.EntriesEnd;
-        __tgt_offload_entry *Cur = Begin;
-        for (uint32_t I = 0; Cur < End; ++Cur, ++I) {
-          if (Cur->addr != HostPtr)
-            continue;
-          // we got a match, now fill the HostPtrToTableMap so that we
-          // may avoid this search next time.
-          TM = &(*HostPtrToTableMap)[HostPtr];
-          TM->Table = TransTable;
-          TM->Index = I;
-          break;
-        }
-      }
-    } else {
-      TM = &TableMapIt->second;
+namespace {
+/// Find the table information in the map or look it up in the translation
+/// tables.
+TableMap *getTableMap(void *HostPtr) {
+  std::lock_guard<std::mutex> TblMapLock(*TblMapMtx);
+  HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(HostPtr);
+
+  if (TableMapIt != HostPtrToTableMap->end())
+    return &TableMapIt->second;
+
+  // We don't have a map. So search all the registered libraries.
+  TableMap *TM = nullptr;
+  std::lock_guard<std::mutex> TrlTblLock(*TrlTblMtx);
+  for (HostEntriesBeginToTransTableTy::iterator Itr =
+           HostEntriesBeginToTransTable->begin();
+       Itr != HostEntriesBeginToTransTable->end(); ++Itr) {
+    // get the translation table (which contains all the good info).
+    TranslationTable *TransTable = &Itr->second;
+    // iterate over all the host table entries to see if we can locate the
+    // host_ptr.
+    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
+    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
+      if (Cur->addr != HostPtr)
+        continue;
+      // we got a match, now fill the HostPtrToTableMap so that we
+      // may avoid this search next time.
+      TM = &(*HostPtrToTableMap)[HostPtr];
+      TM->Table = TransTable;
+      TM->Index = I;
+      return TM;
     }
   }
 
-  // No map for this host pointer found!
-  if (!TM) {
-    DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
-       DPxPTR(HostPtr));
-    return OFFLOAD_FAIL;
-  }
+  return nullptr;
+}
+
+/// Get loop trip count
+/// FIXME: This function will not work right if calling
+/// __kmpc_push_target_tripcount in one thread but doing offloading in another
+/// thread, which might occur when we call task yield.
+uint64_t getLoopTripCount(int64_t DeviceId) {
+  DeviceTy &Device = Devices[DeviceId];
+  uint64_t LoopTripCount = 0;
 
-  // get target table.
-  __tgt_target_table *TargetTable = nullptr;
   {
-    std::lock_guard<std::mutex> TrlTblLock(*TrlTblMtx);
-    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
-           "Not expecting a device ID outside the table's bounds!");
-    TargetTable = TM->Table->TargetsTable[DeviceId];
+    std::lock_guard<std::mutex> TblMapLock(*TblMapMtx);
+    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
+    if (I != Device.LoopTripCnt.end()) {
+      LoopTripCount = I->second;
+      Device.LoopTripCnt.erase(I);
+      DP("loop trip count is %lu.\n", LoopTripCount);
+    }
   }
-  assert(TargetTable && "Global data has not been mapped\n");
 
-  __tgt_async_info AsyncInfo;
+  return LoopTripCount;
+}
 
-  // Move data to device.
+/// Process data before launching the kernel, including calling targetDataBegin
+/// to map and transfer data to target device, transferring (first-)private
+/// variables.
+int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
+                      void **ArgBases, void **Args, int64_t *ArgSizes,
+                      int64_t *ArgTypes, void **ArgMappers,
+                      std::vector<void *> &TgtArgs,
+                      std::vector<ptr
diff _t> &TgtOffsets,
+                      std::vector<void *> &FPArrays,
+                      __tgt_async_info *AsyncInfo) {
+  DeviceTy &Device = Devices[DeviceId];
   int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
-                            ArgMappers, &AsyncInfo);
+                            ArgMappers, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
     DP("Call to targetDataBegin failed, abort target.\n");
     return OFFLOAD_FAIL;
   }
 
-  std::vector<void *> TgtArgs;
-  std::vector<ptr
diff _t> TgtOffsets;
-
   // List of (first-)private arrays allocated for this target region
-  std::vector<void *> FPArrays;
   std::vector<int> TgtArgsPositions(ArgNum, -1);
 
   for (int32_t I = 0; I < ArgNum; ++I) {
@@ -833,7 +830,7 @@ int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
-                                sizeof(void *), &AsyncInfo);
+                                sizeof(void *), AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
@@ -873,8 +870,8 @@ int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
 #endif
       // If first-private, copy data from host
       if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) {
-        Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSizes[I],
-                                &AsyncInfo);
+        Ret =
+            Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSizes[I], AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed, failed.\n");
           return OFFLOAD_FAIL;
@@ -900,50 +897,112 @@ int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
   assert(TgtArgs.size() == TgtOffsets.size() &&
          "Size mismatch in arguments and offsets");
 
-  // Pop loop trip count
-  uint64_t LoopTripCount = 0;
-  {
-    std::lock_guard<std::mutex> TblMapLock(*TblMapMtx);
-    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
-    if (I != Device.LoopTripCnt.end()) {
-      LoopTripCount = I->second;
-      Device.LoopTripCnt.erase(I);
-      DP("loop trip count is %lu.\n", LoopTripCount);
-    }
-  }
+  return OFFLOAD_SUCCESS;
+}
 
-  // Launch device execution.
-  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
-     TargetTable->EntriesBegin[TM->Index].name,
-     DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
-  if (IsTeamConstruct) {
-    Ret = Device.runTeamRegion(TargetTable->EntriesBegin[TM->Index].addr,
-                               &TgtArgs[0], &TgtOffsets[0], TgtArgs.size(),
-                               TeamNum, ThreadLimit, LoopTripCount, &AsyncInfo);
-  } else {
-    Ret =
-        Device.runRegion(TargetTable->EntriesBegin[TM->Index].addr, &TgtArgs[0],
-                         &TgtOffsets[0], TgtArgs.size(), &AsyncInfo);
-  }
+/// Process data after launching the kernel, including transferring data back to
+/// host if needed and deallocating target memory of (first-)private variables.
+/// FIXME: This function has correctness issue that target memory might be
+/// deallocated when they're being used.
+int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
+                     void **ArgBases, void **Args, int64_t *ArgSizes,
+                     int64_t *ArgTypes, void **ArgMappers,
+                     std::vector<void *> &FPArrays,
+                     __tgt_async_info *AsyncInfo) {
+  DeviceTy &Device = Devices[DeviceId];
+
+  // Move data from device.
+  int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
+                          ArgMappers, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
-    DP("Executing target region abort target.\n");
+    DP("Call to targetDataEnd failed, abort targe.\n");
     return OFFLOAD_FAIL;
   }
 
   // Deallocate (first-)private arrays
-  for (auto Itr : FPArrays) {
-    Ret = Device.deleteData(Itr);
+  for (void *P : FPArrays) {
+    Ret = Device.deleteData(P);
     if (Ret != OFFLOAD_SUCCESS) {
       DP("Deallocation of (first-)private arrays failed.\n");
       return OFFLOAD_FAIL;
     }
   }
 
-  // Move data from device.
-  Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
-                      ArgMappers, &AsyncInfo);
+  return OFFLOAD_SUCCESS;
+}
+} // namespace
+
+/// performs the same actions as data_begin in case arg_num is
+/// non-zero and initiates run of the offloaded region on the target platform;
+/// if arg_num is non-zero after the region execution is done it also
+/// performs the same action as data_update and data_end above. This function
+/// returns 0 if it was able to transfer the execution to a target and an
+/// integer 
diff erent from zero otherwise.
+int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
+           void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers,
+           int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct) {
+  DeviceTy &Device = Devices[DeviceId];
+
+  TableMap *TM = getTableMap(HostPtr);
+  // No map for this host pointer found!
+  if (!TM) {
+    DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+       DPxPTR(HostPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  // get target table.
+  __tgt_target_table *TargetTable = nullptr;
+  {
+    std::lock_guard<std::mutex> TrlTblLock(*TrlTblMtx);
+    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
+           "Not expecting a device ID outside the table's bounds!");
+    TargetTable = TM->Table->TargetsTable[DeviceId];
+  }
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  __tgt_async_info AsyncInfo;
+
+  std::vector<void *> TgtArgs;
+  std::vector<ptr
diff _t> TgtOffsets;
+  std::vector<void *> FPArrays;
+
+  // Process data, such as data mapping, before launching the kernel
+  int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args,
+                              ArgSizes, ArgTypes, ArgMappers, TgtArgs,
+                              TgtOffsets, FPArrays, &AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
-    DP("Call to targetDataEnd failed, abort targe.\n");
+    DP("Failed to process data before launching the kernel.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Get loop trip count
+  uint64_t LoopTripCount = getLoopTripCount(DeviceId);
+
+  // Launch device execution.
+  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
+
+  if (IsTeamConstruct)
+    Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
+                               TgtArgs.size(), TeamNum, ThreadLimit,
+                               LoopTripCount, &AsyncInfo);
+  else
+    Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
+                           TgtArgs.size(), &AsyncInfo);
+
+  if (Ret != OFFLOAD_SUCCESS) {
+    DP("Executing target region abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Transfer data back and deallocate target memory for (first-)private
+  // variables
+  Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes,
+                         ArgTypes, ArgMappers, FPArrays, &AsyncInfo);
+  if (Ret != OFFLOAD_SUCCESS) {
+    DP("Failed to process data after launching the kernel.\n");
     return OFFLOAD_FAIL;
   }
 


        


More information about the Openmp-commits mailing list