[Openmp-commits] [openmp] b0789a1 - [OpenMP] Avoid costly shadow map traversals whenever possible

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Wed Jan 19 20:14:54 PST 2022


Author: Johannes Doerfert
Date: 2022-01-19T22:14:41-06:00
New Revision: b0789a1b12cd1d63fdc008e33399ba64a898c722

URL: https://github.com/llvm/llvm-project/commit/b0789a1b12cd1d63fdc008e33399ba64a898c722
DIFF: https://github.com/llvm/llvm-project/commit/b0789a1b12cd1d63fdc008e33399ba64a898c722.diff

LOG: [OpenMP] Avoid costly shadow map traversals whenever possible

In the OpenMC app we saw `omp target update` spending an awful lot of
time in the shadow map traversal without ever doing any update there.
There are two cases that allow us to avoid the traversal completely.
The simplest thing is that small updates cannot (reasonably) contain
an attached pointer part. The other case requires to track in the
mapping table if an entry might contain an attached pointer as part.
Given that we have a single location shadow map entries are created,
the latter is actually fairly easy as well.

Differential Revision: https://reviews.llvm.org/D113124

Added: 
    

Modified: 
    openmp/libomptarget/include/device.h
    openmp/libomptarget/src/api.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/omptarget.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 313b25b2c1d8..9d9b26ffd049 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -57,7 +57,8 @@ struct HostDataToTargetTy {
 
   struct StatesTy {
     StatesTy(uint64_t DRC, uint64_t HRC)
-        : DynRefCount(DRC), HoldRefCount(HRC) {}
+        : DynRefCount(DRC), HoldRefCount(HRC),
+          MayContainAttachedPointers(false) {}
     /// The dynamic reference count is the standard reference count as of OpenMP
     /// 4.5.  The hold reference count is an OpenMP extension for the sake of
     /// OpenACC support.
@@ -75,6 +76,11 @@ struct HostDataToTargetTy {
     ///
     uint64_t DynRefCount;
     uint64_t HoldRefCount;
+
+    /// Boolean flag to remember if any subpart of the mapped region might be
+    /// an attached pointer.
+    bool MayContainAttachedPointers;
+
     /// This mutex will be locked when data movement is issued. For targets that
     /// doesn't support async data movement, this mutex can guarantee that after
     /// it is released, memory region on the target is update to date. For
@@ -196,6 +202,13 @@ struct HostDataToTargetTy {
     return ThisRefCount == 1;
   }
 
+  void setMayContainAttachedPointers() const {
+    States->MayContainAttachedPointers = true;
+  }
+  bool getMayContainAttachedPointers() const {
+    return States->MayContainAttachedPointers;
+  }
+
   /// Helper to make sure the entry is locked in a scope.
   /// TODO: We should generalize this and use it for all our objects that use
   /// lock/unlock methods.
@@ -320,10 +333,11 @@ struct DeviceTy {
                    bool HasCloseModifier, bool HasPresentModifier,
                    bool HasHoldModifier, AsyncInfoTy &AsyncInfo);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
-  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-                       bool UpdateRefCount, bool UseHoldRefCount,
-                       bool &IsHostPtr, bool MustContain = false,
-                       bool ForceDelete = false);
+  TargetPointerResultTy getTgtPtrBegin(void *HstPtrBegin, int64_t Size,
+                                       bool &IsLast, bool UpdateRefCount,
+                                       bool UseHoldRefCount, bool &IsHostPtr,
+                                       bool MustContain = false,
+                                       bool ForceDelete = false);
   /// For the map entry for \p HstPtrBegin, decrement the reference count
   /// specified by \p HasHoldModifier and, if the the total reference count is
   /// then zero, deallocate the corresponding device storage and remove the map

diff  --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 47f30e69b113..5aaf5ad0ef7e 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -107,10 +107,11 @@ EXTERN int omp_target_is_present(const void *ptr, int device_num) {
   DeviceTy &Device = *PM->Devices[device_num];
   bool IsLast; // not used
   bool IsHostPtr;
-  void *TgtPtr = Device.getTgtPtrBegin(const_cast<void *>(ptr), 0, IsLast,
-                                       /*UpdateRefCount=*/false,
-                                       /*UseHoldRefCount=*/false, IsHostPtr);
-  int rc = (TgtPtr != NULL);
+  TargetPointerResultTy TPR =
+      Device.getTgtPtrBegin(const_cast<void *>(ptr), 0, IsLast,
+                            /*UpdateRefCount=*/false,
+                            /*UseHoldRefCount=*/false, IsHostPtr);
+  int rc = (TPR.TargetPointer != NULL);
   // Under unified memory the host pointer can be returned by the
   // getTgtPtrBegin() function which means that there is no device
   // corresponding point for ptr. This function should return false

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index a3f373f8dd53..4797c830c0c1 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -335,11 +335,12 @@ DeviceTy::getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
 // Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
 // Return the target pointer begin (where the data will be moved).
 // Decrement the reference counter if called from targetDataEnd.
-void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-                               bool UpdateRefCount, bool UseHoldRefCount,
-                               bool &IsHostPtr, bool MustContain,
-                               bool ForceDelete) {
-  void *rc = NULL;
+TargetPointerResultTy
+DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+                         bool UpdateRefCount, bool UseHoldRefCount,
+                         bool &IsHostPtr, bool MustContain, bool ForceDelete) {
+  void *TargetPointer = NULL;
+  bool IsNew = false;
   IsHostPtr = false;
   IsLast = false;
   DataMapMtx.lock();
@@ -381,7 +382,7 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
          "Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
          DPxPTR(HstPtrBegin), DPxPTR(tp), Size, HT.dynRefCountToStr().c_str(),
          DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction);
-    rc = (void *)tp;
+    TargetPointer = (void *)tp;
   } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
@@ -390,11 +391,11 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
        "memory\n",
        DPxPTR((uintptr_t)HstPtrBegin), Size);
     IsHostPtr = true;
-    rc = HstPtrBegin;
+    TargetPointer = HstPtrBegin;
   }
 
   DataMapMtx.unlock();
-  return rc;
+  return {{IsNew, IsHostPtr}, lr.Entry, TargetPointer};
 }
 
 // Return the target pointer begin (where the data will be moved).

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 144ac3d89501..71a7a73f1018 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -17,6 +17,7 @@
 #include "rtl.h"
 
 #include <cassert>
+#include <cstdint>
 #include <vector>
 
 int AsyncInfoTy::synchronize() {
@@ -517,11 +518,10 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
 
     const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
     const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
-    auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size,
-                                       HstPtrName, HasFlagTo, HasFlagAlways,
-                                       IsImplicit, UpdateRef, HasCloseModifier,
-                                       HasPresentModifier, HasHoldModifier,
-                                       AsyncInfo);
+    auto TPR = Device.getTargetPointer(
+        HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
+        HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
+        HasPresentModifier, HasHoldModifier, AsyncInfo);
     void *TgtPtrBegin = TPR.TargetPointer;
     IsHostPtr = TPR.Flags.IsHostPointer;
     // If data_size==0, then the argument could be a zero-length pointer to
@@ -568,6 +568,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
         // create or update shadow pointers for this entry
         Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
             HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
+        Pointer_TPR.MapTableEntry->setMayContainAttachedPointers();
         UpdateDevPtr = true;
       }
 
@@ -612,6 +613,49 @@ struct DeallocTgtPtrInfo {
   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
       : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
 };
+
+/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
+/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
+/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
+/// rest of the map is not checked anymore.
+template <typename CBTy>
+static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
+                                    uintptr_t Size,
+                                    const TargetPointerResultTy &TPR) {
+  // If we have an object that is too small to hold a pointer subobject, no need
+  // to do any checking.
+  if (Size < sizeof(void *))
+    return;
+
+  // If the map entry for the object was never marked as containing attached
+  // pointers, no need to do any checking.
+  if (TPR.MapTableEntry == HostDataToTargetListTy::iterator{} ||
+      !TPR.MapTableEntry->getMayContainAttachedPointers())
+    return;
+
+  uintptr_t LB = (uintptr_t)Begin;
+  uintptr_t UB = LB + Size;
+  // Now we are looking into the shadow map so we need to lock it.
+  Device.ShadowMtx.lock();
+  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
+       Itr != Device.ShadowPtrMap.end();) {
+    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;
+
+    // An STL map is sorted on its keys; use this property
+    // to quickly determine when to break out of the loop.
+    if (ShadowHstPtrAddr < LB) {
+      ++Itr;
+      continue;
+    }
+    if (ShadowHstPtrAddr >= UB)
+      break;
+
+    if (CB(Itr) == OFFLOAD_FAIL)
+      break;
+  }
+  Device.ShadowMtx.unlock();
+}
+
 } // namespace
 
 /// Internal function to undo the mapping and retrieve the data from the device.
@@ -679,9 +723,10 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
     bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
-    void *TgtPtrBegin = Device.getTgtPtrBegin(
+    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
         HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
         !IsImplicit, ForceDelete);
+    void *TgtPtrBegin = TPR.TargetPointer;
     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
       DP("Mapping does not exist (%s)\n",
          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
@@ -754,38 +799,26 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
       // need to restore the original host pointer values from their shadow
       // copies. If the struct is going to be deallocated, remove any remaining
       // shadow pointer entries for this struct.
-      uintptr_t LB = (uintptr_t)HstPtrBegin;
-      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
-      Device.ShadowMtx.lock();
-      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
-           Itr != Device.ShadowPtrMap.end();) {
-        void **ShadowHstPtrAddr = (void **)Itr->first;
-
-        // An STL map is sorted on its keys; use this property
-        // to quickly determine when to break out of the loop.
-        if ((uintptr_t)ShadowHstPtrAddr < LB) {
-          ++Itr;
-          continue;
-        }
-        if ((uintptr_t)ShadowHstPtrAddr >= UB)
-          break;
-
+      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
         // If we copied the struct to the host, we need to restore the pointer.
         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
+          void **ShadowHstPtrAddr = (void **)Itr->first;
+          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
           DP("Restoring original host pointer value " DPxMOD " for host "
              "pointer " DPxMOD "\n",
              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
-          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
         }
         // If the struct is to be deallocated, remove the shadow entry.
         if (DelEntry) {
-          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
+          DP("Removing shadow pointer " DPxMOD "\n",
+             DPxPTR((void **)Itr->first));
           Itr = Device.ShadowPtrMap.erase(Itr);
         } else {
           ++Itr;
         }
-      }
-      Device.ShadowMtx.unlock();
+        return OFFLOAD_SUCCESS;
+      };
+      applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);
 
       // Add pointer to the buffer for later deallocation
       if (DelEntry && !IsHostPtr)
@@ -821,9 +854,10 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_IDENT(loc);
   bool IsLast, IsHostPtr;
-  void *TgtPtrBegin = Device.getTgtPtrBegin(
+  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
       HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
       /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
+  void *TgtPtrBegin = TPR.TargetPointer;
   if (!TgtPtrBegin) {
     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
@@ -850,22 +884,15 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
       return OFFLOAD_FAIL;
     }
 
-    uintptr_t LB = (uintptr_t)HstPtrBegin;
-    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
-    Device.ShadowMtx.lock();
-    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
-         IT != Device.ShadowPtrMap.end(); ++IT) {
-      void **ShadowHstPtrAddr = (void **)IT->first;
-      if ((uintptr_t)ShadowHstPtrAddr < LB)
-        continue;
-      if ((uintptr_t)ShadowHstPtrAddr >= UB)
-        break;
+    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
+      void **ShadowHstPtrAddr = (void **)Itr->first;
+      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
       DP("Restoring original host pointer value " DPxMOD
          " for host pointer " DPxMOD "\n",
-         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
-      *ShadowHstPtrAddr = IT->second.HstPtrVal;
-    }
-    Device.ShadowMtx.unlock();
+         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+      return OFFLOAD_SUCCESS;
+    };
+    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
   }
 
   if (ArgType & OMP_TGT_MAPTYPE_TO) {
@@ -877,28 +904,17 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
       return OFFLOAD_FAIL;
     }
 
-    uintptr_t LB = (uintptr_t)HstPtrBegin;
-    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
-    Device.ShadowMtx.lock();
-    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
-         IT != Device.ShadowPtrMap.end(); ++IT) {
-      void **ShadowHstPtrAddr = (void **)IT->first;
-      if ((uintptr_t)ShadowHstPtrAddr < LB)
-        continue;
-      if ((uintptr_t)ShadowHstPtrAddr >= UB)
-        break;
+    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
       DP("Restoring original target pointer value " DPxMOD " for target "
          "pointer " DPxMOD "\n",
-         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
-      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
+         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
+      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                               sizeof(void *), AsyncInfo);
-      if (Ret != OFFLOAD_SUCCESS) {
+      if (Ret != OFFLOAD_SUCCESS)
         REPORT("Copying data to device failed.\n");
-        Device.ShadowMtx.unlock();
-        return OFFLOAD_FAIL;
-      }
-    }
-    Device.ShadowMtx.unlock();
+      return Ret;
+    };
+    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
   }
   return OFFLOAD_SUCCESS;
 }
@@ -1281,9 +1297,10 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
-        PointerTgtPtrBegin = Device.getTgtPtrBegin(
+        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
             HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
             /*UseHoldRefCount=*/false, IsHostPtr);
+        PointerTgtPtrBegin = TPR.TargetPointer;
         if (!PointerTgtPtrBegin) {
           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
              DPxPTR(HstPtrVal));
@@ -1312,6 +1329,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
     ptr
diff _t TgtBaseOffset;
     bool IsLast, IsHostPtr; // unused.
+    TargetPointerResultTy TPR;
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
          DPxPTR(HstPtrBase));
@@ -1337,9 +1355,10 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
     } else {
       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
-      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
-                                          /*UpdateRefCount=*/false,
-                                          /*UseHoldRefCount=*/false, IsHostPtr);
+      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
+                                  /*UpdateRefCount=*/false,
+                                  /*UseHoldRefCount=*/false, IsHostPtr);
+      TgtPtrBegin = TPR.TargetPointer;
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
 #ifdef OMPTARGET_DEBUG
       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);


        


More information about the Openmp-commits mailing list