[Openmp-commits] [openmp] 7c8f4e7 - [OpenMP] Avoid costly shadow map traversals whenever possible

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Fri Dec 10 11:33:25 PST 2021


Author: Joseph Huber
Date: 2021-12-10T14:33:18-05:00
New Revision: 7c8f4e7b85ed98497f37571d72609f39a8eed447

URL: https://github.com/llvm/llvm-project/commit/7c8f4e7b85ed98497f37571d72609f39a8eed447
DIFF: https://github.com/llvm/llvm-project/commit/7c8f4e7b85ed98497f37571d72609f39a8eed447.diff

LOG: [OpenMP] Avoid costly shadow map traversals whenever possible

In the OpenMC app we saw `omp target update` spending an awful lot of
time in the shadow map traversal without ever doing any update there.
There are two cases that allow us to avoid the traversal completely.
The simplest thing is that small updates cannot (reasonably) contain
an attached pointer part. The other case requires to track in the
mapping table if an entry might contain an attached pointer as part.
Given that we have a single location shadow map entries are created,
the latter is actually fairly easy as well.

Reviewed By: grokos

Differential Revision: https://reviews.llvm.org/D113124

Added: 
    

Modified: 
    openmp/libomptarget/src/api.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/device.h
    openmp/libomptarget/src/omptarget.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 47f30e69b1136..5aaf5ad0ef7e4 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -107,10 +107,11 @@ EXTERN int omp_target_is_present(const void *ptr, int device_num) {
   DeviceTy &Device = *PM->Devices[device_num];
   bool IsLast; // not used
   bool IsHostPtr;
-  void *TgtPtr = Device.getTgtPtrBegin(const_cast<void *>(ptr), 0, IsLast,
-                                       /*UpdateRefCount=*/false,
-                                       /*UseHoldRefCount=*/false, IsHostPtr);
-  int rc = (TgtPtr != NULL);
+  TargetPointerResultTy TPR =
+      Device.getTgtPtrBegin(const_cast<void *>(ptr), 0, IsLast,
+                            /*UpdateRefCount=*/false,
+                            /*UseHoldRefCount=*/false, IsHostPtr);
+  int rc = (TPR.TargetPointer != NULL);
   // Under unified memory the host pointer can be returned by the
   // getTgtPtrBegin() function which means that there is no device
   // corresponding point for ptr. This function should return false

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 90d9947068209..1e6b8825d5a27 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -285,12 +285,13 @@ DeviceTy::getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
 
 // Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
 // Return the target pointer begin (where the data will be moved).
-// Decrement the reference counter if called from targetDataEnd.
-void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-                               bool UpdateRefCount, bool UseHoldRefCount,
-                               bool &IsHostPtr, bool MustContain,
-                               bool ForceDelete) {
-  void *rc = NULL;
+// Decrement the reference counter if called from targetDataEnd. The data is
+// excepted to be mapped already, so the result will never be new.
+TargetPointerResultTy
+DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+                         bool UpdateRefCount, bool UseHoldRefCount,
+                         bool &IsHostPtr, bool MustContain, bool ForceDelete) {
+  void *TargetPointer = NULL;
   IsHostPtr = false;
   IsLast = false;
   DataMapMtx.lock();
@@ -332,7 +333,7 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
          "Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
          DPxPTR(HstPtrBegin), DPxPTR(tp), Size, HT.dynRefCountToStr().c_str(),
          DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction);
-    rc = (void *)tp;
+    TargetPointer = (void *)tp;
   } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
@@ -341,11 +342,11 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
        "memory\n",
        DPxPTR((uintptr_t)HstPtrBegin), Size);
     IsHostPtr = true;
-    rc = HstPtrBegin;
+    TargetPointer = HstPtrBegin;
   }
 
   DataMapMtx.unlock();
-  return rc;
+  return {{false, IsHostPtr}, lr.Entry, TargetPointer};
 }
 
 // Return the target pointer begin (where the data will be moved).

diff  --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
index 75dde85a8806d..f41b690acbd24 100644
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -57,7 +57,8 @@ struct HostDataToTargetTy {
 
   struct StatesTy {
     StatesTy(uint64_t DRC, uint64_t HRC)
-        : DynRefCount(DRC), HoldRefCount(HRC) {}
+        : DynRefCount(DRC), HoldRefCount(HRC),
+          MayContainAttachedPointers(false) {}
     /// The dynamic reference count is the standard reference count as of OpenMP
     /// 4.5.  The hold reference count is an OpenMP extension for the sake of
     /// OpenACC support.
@@ -75,6 +76,11 @@ struct HostDataToTargetTy {
     ///
     uint64_t DynRefCount;
     uint64_t HoldRefCount;
+
+    /// Boolean flag to remember if any subpart of the mapped region might be
+    /// an attached pointer.
+    bool MayContainAttachedPointers;
+
     /// This mutex will be locked when data movement is issued. For targets that
     /// doesn't support async data movement, this mutex can guarantee that after
     /// it is released, memory region on the target is update to date. For
@@ -181,6 +187,13 @@ struct HostDataToTargetTy {
   void lock() const { States->UpdateMtx.lock(); }
 
   void unlock() const { States->UpdateMtx.unlock(); }
+
+  void setMayContainAttachedPointers() const {
+    States->MayContainAttachedPointers = true;
+  }
+  bool getMayContainAttachedPointers() const {
+    return States->MayContainAttachedPointers;
+  }
 };
 
 typedef uintptr_t HstPtrBeginTy;
@@ -290,10 +303,11 @@ struct DeviceTy {
                    bool HasCloseModifier, bool HasPresentModifier,
                    bool HasHoldModifier, AsyncInfoTy &AsyncInfo);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
-  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-                       bool UpdateRefCount, bool UseHoldRefCount,
-                       bool &IsHostPtr, bool MustContain = false,
-                       bool ForceDelete = false);
+  TargetPointerResultTy getTgtPtrBegin(void *HstPtrBegin, int64_t Size,
+                                       bool &IsLast, bool UpdateRefCount,
+                                       bool UseHoldRefCount, bool &IsHostPtr,
+                                       bool MustContain = false,
+                                       bool ForceDelete = false);
   /// For the map entry for \p HstPtrBegin, decrement the reference count
   /// specified by \p HasHoldModifier and, if the the total reference count is
   /// then zero, deallocate the corresponding device storage and remove the map

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3e9f6427b4724..a08963c87eef9 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -17,6 +17,7 @@
 #include "rtl.h"
 
 #include <cassert>
+#include <cstdint>
 #include <vector>
 
 int AsyncInfoTy::synchronize() {
@@ -568,6 +569,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
         // create or update shadow pointers for this entry
         Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
             HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
+        TPR.MapTableEntry->setMayContainAttachedPointers();
         UpdateDevPtr = true;
       }
 
@@ -611,6 +613,48 @@ struct DeallocTgtPtrInfo {
   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
       : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
 };
+
+/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
+/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
+/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
+/// rest of the map is not checked anymore.
+template <typename CBTy>
+static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
+                                    uintptr_t Size,
+                                    const TargetPointerResultTy &TPR) {
+  // If we have an object that is too small to hold a pointer subobject, no need
+  // to do any checking.
+  if (Size < sizeof(void *))
+    return;
+
+  // If the map entry for the object was never marked as containing attached
+  // pointers, no need to do any checking.
+  if (!TPR.MapTableEntry->getMayContainAttachedPointers())
+    return;
+
+  uintptr_t LB = (uintptr_t)Begin;
+  uintptr_t UB = LB + Size;
+  // Now we are looking into the shadow map so we need to lock it.
+  Device.ShadowMtx.lock();
+  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
+       Itr != Device.ShadowPtrMap.end();) {
+    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;
+
+    // An STL map is sorted on its keys; use this property
+    // to quickly determine when to break out of the loop.
+    if (ShadowHstPtrAddr < LB) {
+      ++Itr;
+      continue;
+    }
+    if (ShadowHstPtrAddr >= UB)
+      break;
+
+    if (CB(Itr) == OFFLOAD_FAIL)
+      break;
+  }
+  Device.ShadowMtx.unlock();
+}
+
 } // namespace
 
 /// Internal function to undo the mapping and retrieve the data from the device.
@@ -678,9 +722,10 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
     bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
-    void *TgtPtrBegin = Device.getTgtPtrBegin(
+    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
         HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
         !IsImplicit, ForceDelete);
+    void *TgtPtrBegin = TPR.TargetPointer;
     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
       DP("Mapping does not exist (%s)\n",
          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
@@ -753,24 +798,10 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
       // need to restore the original host pointer values from their shadow
       // copies. If the struct is going to be deallocated, remove any remaining
       // shadow pointer entries for this struct.
-      uintptr_t LB = (uintptr_t)HstPtrBegin;
-      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
-      Device.ShadowMtx.lock();
-      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
-           Itr != Device.ShadowPtrMap.end();) {
-        void **ShadowHstPtrAddr = (void **)Itr->first;
-
-        // An STL map is sorted on its keys; use this property
-        // to quickly determine when to break out of the loop.
-        if ((uintptr_t)ShadowHstPtrAddr < LB) {
-          ++Itr;
-          continue;
-        }
-        if ((uintptr_t)ShadowHstPtrAddr >= UB)
-          break;
-
+      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
         // If we copied the struct to the host, we need to restore the pointer.
         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
+          void **ShadowHstPtrAddr = (void **)Itr->first;
           DP("Restoring original host pointer value " DPxMOD " for host "
              "pointer " DPxMOD "\n",
              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
@@ -778,13 +809,15 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
         }
         // If the struct is to be deallocated, remove the shadow entry.
         if (DelEntry) {
-          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
+          DP("Removing shadow pointer " DPxMOD "\n",
+             DPxPTR(Itr->second.HstPtrVal));
           Itr = Device.ShadowPtrMap.erase(Itr);
         } else {
           ++Itr;
         }
-      }
-      Device.ShadowMtx.unlock();
+        return OFFLOAD_SUCCESS;
+      };
+      applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);
 
       // Add pointer to the buffer for later deallocation
       if (DelEntry && !IsHostPtr)
@@ -820,9 +853,10 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_IDENT(loc);
   bool IsLast, IsHostPtr;
-  void *TgtPtrBegin = Device.getTgtPtrBegin(
+  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
       HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
       /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
+  void *TgtPtrBegin = TPR.TargetPointer;
   if (!TgtPtrBegin) {
     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
@@ -849,22 +883,15 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
       return OFFLOAD_FAIL;
     }
 
-    uintptr_t LB = (uintptr_t)HstPtrBegin;
-    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
-    Device.ShadowMtx.lock();
-    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
-         IT != Device.ShadowPtrMap.end(); ++IT) {
-      void **ShadowHstPtrAddr = (void **)IT->first;
-      if ((uintptr_t)ShadowHstPtrAddr < LB)
-        continue;
-      if ((uintptr_t)ShadowHstPtrAddr >= UB)
-        break;
+    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
+      void **ShadowHstPtrAddr = (void **)Itr->first;
       DP("Restoring original host pointer value " DPxMOD
          " for host pointer " DPxMOD "\n",
-         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
-      *ShadowHstPtrAddr = IT->second.HstPtrVal;
-    }
-    Device.ShadowMtx.unlock();
+         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
+      return OFFLOAD_SUCCESS;
+    };
+    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
   }
 
   if (ArgType & OMP_TGT_MAPTYPE_TO) {
@@ -876,28 +903,17 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
       return OFFLOAD_FAIL;
     }
 
-    uintptr_t LB = (uintptr_t)HstPtrBegin;
-    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
-    Device.ShadowMtx.lock();
-    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
-         IT != Device.ShadowPtrMap.end(); ++IT) {
-      void **ShadowHstPtrAddr = (void **)IT->first;
-      if ((uintptr_t)ShadowHstPtrAddr < LB)
-        continue;
-      if ((uintptr_t)ShadowHstPtrAddr >= UB)
-        break;
+    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
       DP("Restoring original target pointer value " DPxMOD " for target "
          "pointer " DPxMOD "\n",
-         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
-      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
+         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
+      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                               sizeof(void *), AsyncInfo);
-      if (Ret != OFFLOAD_SUCCESS) {
+      if (Ret != OFFLOAD_SUCCESS)
         REPORT("Copying data to device failed.\n");
-        Device.ShadowMtx.unlock();
-        return OFFLOAD_FAIL;
-      }
-    }
-    Device.ShadowMtx.unlock();
+      return Ret;
+    };
+    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
   }
   return OFFLOAD_SUCCESS;
 }
@@ -1280,9 +1296,10 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
-        PointerTgtPtrBegin = Device.getTgtPtrBegin(
+        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
             HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
             /*UseHoldRefCount=*/false, IsHostPtr);
+        PointerTgtPtrBegin = TPR.TargetPointer;
         if (!PointerTgtPtrBegin) {
           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
              DPxPTR(HstPtrVal));
@@ -1311,6 +1328,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
     ptr
diff _t TgtBaseOffset;
     bool IsLast, IsHostPtr; // unused.
+    TargetPointerResultTy TPR;
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
          DPxPTR(HstPtrBase));
@@ -1336,9 +1354,10 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
     } else {
       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
-      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
-                                          /*UpdateRefCount=*/false,
-                                          /*UseHoldRefCount=*/false, IsHostPtr);
+      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
+                                  /*UpdateRefCount=*/false,
+                                  /*UseHoldRefCount=*/false, IsHostPtr);
+      TgtPtrBegin = TPR.TargetPointer;
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
 #ifdef OMPTARGET_DEBUG
       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);


        


More information about the Openmp-commits mailing list