[Openmp-commits] [openmp] b316126 - [OpenMP][FIX] Avoid races in the handling of to be deleted mapping entries
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Mon Mar 28 20:34:18 PDT 2022
Author: Johannes Doerfert
Date: 2022-03-28T22:33:18-05:00
New Revision: b316126887d0e41a9e22717419d43af9d81b764c
URL: https://github.com/llvm/llvm-project/commit/b316126887d0e41a9e22717419d43af9d81b764c
DIFF: https://github.com/llvm/llvm-project/commit/b316126887d0e41a9e22717419d43af9d81b764c.diff
LOG: [OpenMP][FIX] Avoid races in the handling of to be deleted mapping entries
If we decided to delete a mapping entry we did not act on it right away
but first issued and waited for memory copies. In the meantime some
other thread might reuse the entry. While there was some logic to avoid
colliding on the actual "deletion" part, there were two races happening:
1) The data transfer back of the thread deleting the entry and
the data transfer back of the thread taking over the entry raced.
2) The update to the shadow map happened regardless if the entry was
actually reused by another thread which left the shadow map in a
inconsistent state.
To fix both issues we will now update the shadow map and delete the
entry only if we are sure the thread is responsible for deletion, hence
no other thread took over the entry and reused it. We also wait for a
potential former data transfer from the device to finish before we issue
another one that would race with it.
Fixes https://github.com/llvm/llvm-project/issues/54216
Differential Revision: https://reviews.llvm.org/D121058
Added:
openmp/libomptarget/test/mapping/map_back_race.cpp
Modified:
openmp/libomptarget/include/device.h
openmp/libomptarget/src/device.cpp
openmp/libomptarget/src/omptarget.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index cc5170e4c5722..da11189979d27 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -21,6 +21,7 @@
#include <memory>
#include <mutex>
#include <set>
+#include <thread>
#include <vector>
#include "ExclusiveAccess.h"
@@ -60,7 +61,8 @@ struct HostDataToTargetTy {
struct StatesTy {
StatesTy(uint64_t DRC, uint64_t HRC)
: DynRefCount(DRC), HoldRefCount(HRC),
- MayContainAttachedPointers(false) {}
+ MayContainAttachedPointers(false), DeleteThreadId(std::thread::id()) {
+ }
/// The dynamic reference count is the standard reference count as of OpenMP
/// 4.5. The hold reference count is an OpenMP extension for the sake of
/// OpenACC support.
@@ -98,6 +100,14 @@ struct HostDataToTargetTy {
/// mechanism for D2H, and if the event cannot be shared between them, Event
/// should be written as <tt>void *Event[2]</tt>.
void *Event = nullptr;
+
+ /// The id of the thread responsible for deleting this entry. This thread
+ /// set the reference count to zero *last*. Other threads might reuse the
+ /// entry while it is marked for deletion but not yet deleted (e.g., the
+ /// data is still being moved back). If another thread reuses the entry we
+ /// will have a non-zero reference count *or* the thread will have changed
+ /// this id, effectively taking over deletion responsibility.
+ std::thread::id DeleteThreadId;
};
// When HostDataToTargetTy is used by std::set, std::set::iterator is const
// use unique_ptr to make States mutable.
@@ -138,6 +148,14 @@ struct HostDataToTargetTy {
/// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise.
int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const;
+ /// Indicate that the current thread expected to delete this entry.
+ void setDeleteThreadId() const {
+ States->DeleteThreadId = std::this_thread::get_id();
+ }
+
+ /// Return the thread id of the thread expected to delete this entry.
+ std::thread::id getDeleteThreadId() const { return States->DeleteThreadId; }
+
/// Set the event bound to this data map.
void setEvent(void *Event) const { States->Event = Event; }
@@ -172,7 +190,7 @@ struct HostDataToTargetTy {
if (ThisRefCount > 0)
--ThisRefCount;
else
- assert(OtherRefCount > 0 && "total refcount underflow");
+ assert(OtherRefCount >= 0 && "total refcount underflow");
}
return getTotalRefCount();
}
@@ -362,14 +380,16 @@ struct DeviceTy {
bool UseHoldRefCount, bool &IsHostPtr,
bool MustContain = false,
bool ForceDelete = false);
- /// For the map entry for \p HstPtrBegin, decrement the reference count
- /// specified by \p HasHoldModifier and, if the the total reference count is
- /// then zero, deallocate the corresponding device storage and remove the map
- /// entry. Return \c OFFLOAD_SUCCESS if the map entry existed, and return
- /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling
- /// this function if the map entry is not expected to exist because
- /// \p HstPtrBegin uses shared memory.
- int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool HasHoldModifier);
+
+ /// Deallocate \p LR and remove the entry. Assume the total reference count is
+ /// zero and the calling thread is the deleting thread for \p LR. \p HDTTMap
+ /// ensure the caller holds exclusive access and can modify the map. Return \c
+ /// OFFLOAD_SUCCESS if the map entry existed, and return \c OFFLOAD_FAIL if
+ /// not. It is the caller's responsibility to skip calling this function if
+ /// the map entry is not expected to exist because \p HstPtrBegin uses shared
+ /// memory.
+ int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size);
+
int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
int disassociatePtr(void *HstPtrBegin);
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index df9e422c7e245..328d04cb1f8cb 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -20,6 +20,7 @@
#include <cstdint>
#include <cstdio>
#include <string>
+#include <thread>
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
AsyncInfoTy &AsyncInfo) const {
@@ -207,9 +208,10 @@ TargetPointerResultTy DeviceTy::getTargetPointer(
((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) {
auto &HT = *LR.Entry;
const char *RefCountAction;
- assert(HT.getTotalRefCount() > 0 && "expected existing RefCount > 0");
if (UpdateRefCount) {
- // After this, RefCount > 1.
+ // After this, reference count >= 1. If the reference count was 0 but the
+ // entry was still there we can reuse the data on the device and avoid a
+ // new submission.
HT.incRefCount(HasHoldModifier);
RefCountAction = " (incremented)";
} else {
@@ -349,27 +351,30 @@ DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
if (lr.Flags.IsContained ||
(!MustContain && (lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter))) {
auto &HT = *lr.Entry;
- // We do not zero the total reference count here. deallocTgtPtr does that
- // atomically with removing the mapping. Otherwise, before this thread
- // removed the mapping in deallocTgtPtr, another thread could retrieve the
- // mapping, increment and decrement back to zero, and then both threads
- // would try to remove the mapping, resulting in a double free.
IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete);
- const char *RefCountAction;
- if (!UpdateRefCount) {
- RefCountAction = " (update suppressed)";
- } else if (ForceDelete) {
+
+ if (ForceDelete) {
HT.resetRefCount(UseHoldRefCount);
assert(IsLast == HT.decShouldRemove(UseHoldRefCount) &&
"expected correct IsLast prediction for reset");
- if (IsLast)
- RefCountAction = " (reset, deferred final decrement)";
- else {
- HT.decRefCount(UseHoldRefCount);
- RefCountAction = " (reset)";
- }
+ }
+
+ const char *RefCountAction;
+ if (!UpdateRefCount) {
+ RefCountAction = " (update suppressed)";
} else if (IsLast) {
- RefCountAction = " (deferred final decrement)";
+ // Mark the entry as to be deleted by this thread. Another thread might
+ // reuse the entry and take "ownership" for the deletion while this thread
+ // is waiting for data transfers. That is fine and the current thread will
+ // simply skip the deletion step then.
+ HT.setDeleteThreadId();
+ HT.decRefCount(UseHoldRefCount);
+ assert(HT.getTotalRefCount() == 0 &&
+ "Expected zero reference count when deletion is scheduled");
+ if (ForceDelete)
+ RefCountAction = " (reset, delayed deletion)";
+ else
+ RefCountAction = " (decremented, delayed deletion)";
} else {
HT.decRefCount(UseHoldRefCount);
RefCountAction = " (decremented)";
@@ -411,37 +416,38 @@ void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
return NULL;
}
-int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size,
- bool HasHoldModifier) {
- HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
-
+int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR,
+ int64_t Size) {
// Check if the pointer is contained in any sub-nodes.
- int Ret = OFFLOAD_SUCCESS;
- LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size);
- if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
- auto &HT = *lr.Entry;
- if (HT.decRefCount(HasHoldModifier) == 0) {
- DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
- DPxPTR(HT.TgtPtrBegin), Size);
- deleteData((void *)HT.TgtPtrBegin);
- INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
- "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
- ", Size=%" PRId64 ", Name=%s\n",
- DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size,
- (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str()
- : "unknown");
- void *Event = lr.Entry->getEvent();
- HDTTMap->erase(lr.Entry);
- delete lr.Entry;
- if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) {
- REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
- Ret = OFFLOAD_FAIL;
- }
- }
- } else {
+ if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore ||
+ LR.Flags.ExtendsAfter)) {
REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the"
" allocated memory\n",
- DPxPTR(HstPtrBegin));
+ DPxPTR(LR.Entry->HstPtrBegin));
+ return OFFLOAD_FAIL;
+ }
+
+ auto &HT = *LR.Entry;
+ // Verify this thread is still in charge of deleting the entry.
+ assert(HT.getTotalRefCount() == 0 &&
+ HT.getDeleteThreadId() == std::this_thread::get_id() &&
+ "Trying to delete entry that is in use or owned by another thread.");
+
+ DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
+ DPxPTR(HT.TgtPtrBegin), Size);
+ deleteData((void *)HT.TgtPtrBegin);
+ INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
+ "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
+ ", Size=%" PRId64 ", Name=%s\n",
+ DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size,
+ (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() : "unknown");
+ void *Event = LR.Entry->getEvent();
+ HDTTMap->erase(LR.Entry);
+ delete LR.Entry;
+
+ int Ret = OFFLOAD_SUCCESS;
+ if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) {
+ REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
Ret = OFFLOAD_FAIL;
}
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index d230bcdf2c8d8..62e46cab98c45 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -607,17 +607,29 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
namespace {
/// This structure contains information to deallocate a target pointer, aka.
-/// used to call the function \p DeviceTy::deallocTgtPtr.
-struct DeallocTgtPtrInfo {
+/// used to fix up the shadow map and potentially delete the entry from the
+/// mapping table via \p DeviceTy::deallocTgtPtr.
+struct PostProcessingInfo {
/// Host pointer used to look up into the map table
void *HstPtrBegin;
+
/// Size of the data
int64_t DataSize;
- /// Whether it has \p ompx_hold modifier
- bool HasHoldModifier;
- DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
- : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
+ /// The mapping type (bitfield).
+ int64_t ArgType;
+
+ /// The target pointer information.
+ TargetPointerResultTy TPR;
+
+ /// Are we expecting to delete this entry or not. Even if set, we might not
+ /// delete the entry if another thread reused the entry in the meantime.
+ bool DelEntry;
+
+ PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry,
+ TargetPointerResultTy TPR)
+ : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR),
+ DelEntry(DelEntry) {}
};
/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
@@ -668,7 +680,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
int Ret;
- std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
+ std::vector<PostProcessingInfo> PostProcessingPtrs;
void *FromMapperBase = nullptr;
// process each input.
for (int32_t I = ArgNum - 1; I >= 0; --I) {
@@ -786,12 +798,33 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
if ((Always || IsLast) && !IsHostPtr) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+
+ std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry);
+ // Wait for any previous transfer if an event is present.
+ if (void *Event = TPR.Entry->getEvent()) {
+ if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
+ REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
+ return OFFLOAD_FAIL;
+ }
+ }
+
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
}
+
+ // As we are expecting to delete the entry the d2h copy might race
+ // with another one that also tries to delete the entry. This happens
+ // as the entry can be reused and the reuse might happen after the
+ // copy-back was issued but before it completed. Since the reuse might
+ // also copy-back a value we would race.
+ if (IsLast) {
+ if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
+ OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+ }
}
}
if (DelEntry && FromMapper && I == 0) {
@@ -799,38 +832,9 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
FromMapperBase = HstPtrBegin;
}
- // If we copied back to the host a struct/array containing pointers, we
- // need to restore the original host pointer values from their shadow
- // copies. If the struct is going to be deallocated, remove any remaining
- // shadow pointer entries for this struct.
- auto CB = [&](ShadowPtrListTy::iterator &Itr) {
- // If we copied the struct to the host, we need to restore the pointer.
- if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
- void **ShadowHstPtrAddr = (void **)Itr->first;
- // Wait for device-to-host memcopies for whole struct to complete,
- // before restoring the correct host pointer.
- if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
- *ShadowHstPtrAddr = Itr->second.HstPtrVal;
- DP("Restoring original host pointer value " DPxMOD " for host "
- "pointer " DPxMOD "\n",
- DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
- }
- // If the struct is to be deallocated, remove the shadow entry.
- if (DelEntry) {
- DP("Removing shadow pointer " DPxMOD "\n",
- DPxPTR((void **)Itr->first));
- Itr = Device.ShadowPtrMap.erase(Itr);
- } else {
- ++Itr;
- }
- return OFFLOAD_SUCCESS;
- };
- applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);
-
- // Add pointer to the buffer for later deallocation
- if (DelEntry && !IsHostPtr)
- DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
+ // Add pointer to the buffer for post-synchronize processing.
+ PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
+ DelEntry && !IsHostPtr, TPR);
}
}
@@ -843,18 +847,66 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
return OFFLOAD_FAIL;
// Deallocate target pointer
- for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
- if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
- continue;
- Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
- Info.HasHoldModifier);
- if (Ret != OFFLOAD_SUCCESS) {
- REPORT("Deallocating data from device failed.\n");
- return OFFLOAD_FAIL;
+ for (PostProcessingInfo &Info : PostProcessingPtrs) {
+ // If we marked the entry to be deleted we need to verify no other thread
+ // reused it by now. If deletion is still supposed to happen by this thread
+ // LR will be set and exclusive access to the HDTT map will avoid another
+ // thread reusing the entry now. Note that we do not request (exclusive)
+ // access to the HDTT map if Info.DelEntry is not set.
+ LookupResult LR;
+ DeviceTy::HDTTMapAccessorTy HDTTMap =
+ Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);
+
+ if (Info.DelEntry) {
+ LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
+ if (LR.Entry->getTotalRefCount() != 0 ||
+ LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
+ // The thread is not in charge of deletion anymore. Give up access to
+ // the HDTT map and unset the deletion flag.
+ HDTTMap.destroy();
+ Info.DelEntry = false;
+ }
+ }
+
+ // If we copied back to the host a struct/array containing pointers, we
+ // need to restore the original host pointer values from their shadow
+ // copies. If the struct is going to be deallocated, remove any remaining
+ // shadow pointer entries for this struct.
+ auto CB = [&](ShadowPtrListTy::iterator &Itr) {
+ // If we copied the struct to the host, we need to restore the pointer.
+ if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
+ void **ShadowHstPtrAddr = (void **)Itr->first;
+ *ShadowHstPtrAddr = Itr->second.HstPtrVal;
+ DP("Restoring original host pointer value " DPxMOD " for host "
+ "pointer " DPxMOD "\n",
+ DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+ }
+ // If the struct is to be deallocated, remove the shadow entry.
+ if (Info.DelEntry) {
+ DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first));
+ Itr = Device.ShadowPtrMap.erase(Itr);
+ } else {
+ ++Itr;
+ }
+ return OFFLOAD_SUCCESS;
+ };
+ applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize,
+ Info.TPR);
+
+ // If we are deleting the entry the DataMapMtx is locked and we own the
+ // entry.
+ if (Info.DelEntry) {
+ if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
+ Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);
+
+ if (Ret != OFFLOAD_SUCCESS) {
+ REPORT("Deallocating data from device failed.\n");
+ break;
+ }
}
}
- return OFFLOAD_SUCCESS;
+ return Ret;
}
static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
diff --git a/openmp/libomptarget/test/mapping/map_back_race.cpp b/openmp/libomptarget/test/mapping/map_back_race.cpp
new file mode 100644
index 0000000000000..b81feb8aab724
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/map_back_race.cpp
@@ -0,0 +1,32 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// Taken from https://github.com/llvm/llvm-project/issues/54216
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+bool almost_equal(float x, float gold, float rel_tol = 1e-09,
+ float abs_tol = 0.0) {
+ return std::abs(x - gold) <=
+ std::max(rel_tol * std::max(std::abs(x), std::abs(gold)), abs_tol);
+}
+void test_parallel_for__target() {
+ const int N0{32768};
+ const float expected_value{N0};
+ float counter_N0{};
+#pragma omp parallel for
+ for (int i0 = 0; i0 < N0; i0++) {
+#pragma omp target map(tofrom : counter_N0)
+ {
+#pragma omp atomic update
+ counter_N0 = counter_N0 + 1.;
+ }
+ }
+ if (!almost_equal(counter_N0, expected_value, 0.01)) {
+ std::cerr << "Expected: " << expected_value << " Got: " << counter_N0
+ << std::endl;
+ std::exit(112);
+ }
+}
+int main() { test_parallel_for__target(); }
More information about the Openmp-commits
mailing list