[llvm] [Offload] Introduce ATTACH map-type support for pointer attachment. (PR #149036)
Abhinav Gaba via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 8 04:45:54 PDT 2025
https://github.com/abhinavgaba updated https://github.com/llvm/llvm-project/pull/149036
>From 5d72174220263f549c0d301ad9c4445a91bbde4a Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Wed, 16 Jul 2025 00:53:09 -0700
Subject: [PATCH 1/9] [WIP][Offload] Introduce ATTACH map-type support for
pointer attachment.
This patch introduces libomptarget support for the ATTACH map-type,
which can be used to implement OpenMP conditional compliant pointer attachment,
based on whether the pointer/pointee is newly mapped on a given construct.
For example, for the following:
```c
int *p;
#pragma omp target enter data map(p[1:10])
```
The following maps can be emitted by clang:
```
(A)
&p[0], &p[1], 10 * sizeof(p[1]), TO | FROM
&p, &p[1], sizeof(p), ATTACH
```
Without this map-type, the two possible maps emitted by clang:
```
(B)
&p[0], &p[1], 10 * sizeof(p[1]), TO | FROM
(C)
&p, &p[1], 10 * sizeof(p[1]), TO | FROM | PTR_AND_OBJ
````
(B) does not perform any pointer attachment, while (C) also maps the
pointer p, which are both incorrect.
In terms of implementation, maps with the ATTACH map-type are handled after
all other maps have been processed, as it requires knowledge of which new
allocations happened as part of the construct. As per OpenMP 5.0, an
attachment should happen only when either the pointer or the pointee was
newly mapped while handling the construct.
Maps with ATTACH map-type-bit do not increase/decrease the ref-count.
With OpenMP 6.1, `attach(always/never)` can be used to force/prevent
attachment. For `attach(always)`, the compiler will insert the ALWAYS
map-type, which would let libomptarget bypass the check about one of the
pointer/pointee being new. With `attach(never)`, the ATTACH map will not
be emitted at all.
The size argument of the ATTACH map-type can specify values greater than
`sizeof(void*)` which can be used to support pointer attachment on Fortran
descriptors. Note that this also requires shadow-pointer tracking to also
support them. That has not been implemented in this patch.
This was worked upon in coordination with Ravi Narayanaswamy, who has
since retired. Happy retirement, Ravi!
---
offload/include/OpenMP/Mapping.h | 40 ++-
offload/include/omptarget.h | 3 +
offload/libomptarget/interface.cpp | 22 +-
offload/libomptarget/omptarget.cpp | 409 ++++++++++++++++++++++++++---
4 files changed, 438 insertions(+), 36 deletions(-)
diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h
index b9f5c16582931..93c1e56905ae4 100644
--- a/offload/include/OpenMP/Mapping.h
+++ b/offload/include/OpenMP/Mapping.h
@@ -417,12 +417,42 @@ struct MapperComponentsTy {
typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
void *);
+/// Structure to store information about a single ATTACH map entry.
+struct AttachMapInfo {
+ void *PointerBase;
+ void *PointeeBegin;
+ int64_t PointerSize;
+ int64_t MapType;
+ map_var_info_t Pointername;
+
+ AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size,
+ int64_t Type, map_var_info_t Name)
+ : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size),
+ MapType(Type), Pointername(Name) {}
+};
+
+/// Structure to track ATTACH entries and new allocations across recursive calls
+/// (for handling mappers) to targetDataBegin for a given construct.
+struct AttachInfoTy {
+ /// ATTACH map entries for deferred processing.
+ llvm::SmallVector<AttachMapInfo> AttachEntries;
+
+ /// Key: host pointer, Value: allocation size.
+ llvm::DenseMap<void *, int64_t> NewAllocations;
+
+ AttachInfoTy() = default;
+
+ // Delete copy constructor and copy assignment operator to prevent copying
+ AttachInfoTy(const AttachInfoTy &) = delete;
+ AttachInfoTy &operator=(const AttachInfoTy &) = delete;
+};
+
// Function pointer type for targetData* functions (targetDataBegin,
// targetDataEnd and targetDataUpdate).
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
void **, int64_t *, int64_t *,
map_var_info_t *, void **, AsyncInfoTy &,
- bool);
+ AttachInfoTy *, bool);
void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
bool toStdOut = false);
@@ -431,20 +461,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
+ AttachInfoTy *AttachInfo = nullptr,
bool FromMapper = false);
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
- bool FromMapper = false);
+ AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false);
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
+ AttachInfoTy *AttachInfo = nullptr,
bool FromMapper = false);
+// Process deferred ATTACH map entries collected during targetDataBegin.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+ AsyncInfoTy &AsyncInfo);
+
struct MappingInfoTy {
MappingInfoTy(DeviceTy &Device) : Device(Device) {}
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..9e4bfd2f9cfbe 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -80,6 +80,9 @@ enum tgt_map_type {
// the structured region
// This is an OpenMP extension for the sake of OpenACC support.
OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
+ // Attach pointer and pointee, after processing all other maps.
+ // Applicable to map-entering directives. Does not change ref-count.
+ OMP_TGT_MAPTYPE_ATTACH = 0x4000,
// descriptor for non-contiguous target-update
OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
// member of struct, member given by [16 MSBs] - 1
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index ea354400f2e99..16e46e6d23872 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -165,12 +165,28 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
OMPT_GET_RETURN_ADDRESS);)
int Rc = OFFLOAD_SUCCESS;
+
+ // Only allocate AttachInfo for targetDataBegin
+ AttachInfoTy *AttachInfo = nullptr;
+ if (TargetDataFunction == targetDataBegin) {
+ AttachInfo = new AttachInfoTy();
+ }
+
Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo,
- false /*FromMapper=*/);
+ AttachInfo, false /*FromMapper=*/);
- if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ if (Rc == OFFLOAD_SUCCESS) {
+ // Process deferred ATTACH entries BEFORE synchronization
+ if (AttachInfo && !AttachInfo->AttachEntries.empty())
+ Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo);
+
+ if (Rc == OFFLOAD_SUCCESS)
+ Rc = AsyncInfo.synchronize();
+ }
+
+ if (AttachInfo)
+ delete AttachInfo;
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
}
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 5b25d955dd320..aa142814e8384 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
void *ArgMapper, AsyncInfoTy &AsyncInfo,
- TargetDataFuncPtrTy TargetDataFunction) {
+ TargetDataFuncPtrTy TargetDataFunction,
+ AttachInfoTy *AttachInfo = nullptr) {
DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
// The mapper function fills up Components.
@@ -324,17 +325,196 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
MapperArgsBase.data(), MapperArgs.data(),
MapperArgSizes.data(), MapperArgTypes.data(),
MapperArgNames.data(), /*arg_mappers*/ nullptr,
- AsyncInfo, /*FromMapper=*/true);
+ AsyncInfo, AttachInfo, /*FromMapper=*/true);
return Rc;
}
+/// Utility function to perform a pointer attachment operation.
+///
+/// For something like:
+/// \code
+/// int *p;
+/// ...
+/// #pragma omp target enter data map(to:p[10:10])
+/// \endcode
+///
+/// for which the attachment operation gets represented using:
+/// \code
+/// &p, &p[10], sizeof(p), ATTACH
+/// \endcode
+///
+/// (Hst|Tgt)PtrAddr represents &p
+/// (Hst|Tgt)PteeBase represents &p[0]
+/// (Hst|Tgt)PteeBegin represents &p[10]
+///
+/// This function first computes the expected TgtPteeBase using:
+/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
+///
+/// and then attaches TgtPteeBase to TgtPtrAddr.
+///
+/// \p HstPtrSize represents the size of the pointer p. For C/C++, this
+/// should be same as "sizeof(void*)" (say 8).
+///
+/// However, for Fortran, pointers/allocatables, which are also eligible for
+/// "pointer-attachment", may be implemented using descriptors that contain the
+/// address of the pointee in the first 8 bytes, but also contain other
+/// information such as lower-bound/upper-bound etc in their subsequent fields.
+///
+/// For example, for the following:
+/// \code
+/// integer, allocatable :: x(:)
+/// integer, pointer :: p(:)
+/// ...
+/// p => x(10: 19)
+/// ...
+/// !$omp target enter data map(to:p(:))
+/// \endcode
+///
+/// The map should trigger a pointer-attachment (assuming the pointer-attachment
+/// conditions as noted on processAttachEntries are met) between the descriptor
+/// for p, and its pointee data.
+///
+/// Since only the first 8 bytes of the descriptor contain the address of the
+/// pointee, an attachment operation on device descriptors involves:
+/// * Setting the first 8 bytes of the device descriptor to point the device
+/// address of the pointee.
+/// * Copying the remaining information about bounds/offset etc. from the host
+/// descriptor to the device descriptor.
+///
+/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps,
+/// like:
+/// \code
+/// &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ
+/// \endcoe
+/// by using "sizeof(void*)" as \p HstPtrSize.
+static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
+ void **HstPtrAddr, void *HstPteeBase,
+ void *HstPteeBegin, void **TgtPtrAddr,
+ void *TgtPteeBegin, int64_t HstPtrSize,
+ TargetPointerResultTy &PtrTPR) {
+ assert(PtrTPR.getEntry() &&
+ "Need a valid pointer entry to perform pointer-attachment");
+
+ int64_t VoidPtrSize = sizeof(void *);
+ assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
+
+ uint64_t Delta = (uint64_t)HstPteeBegin - (uint64_t)HstPteeBase;
+ void *TgtPteeBase = (void *)((uint64_t)TgtPteeBegin - Delta);
+
+ // Add shadow pointer tracking
+ // TODO: Support shadow-tracking of larger than VoidPtrSize pointers,
+ // to support restoration of Fortran descriptors. Currently, this check
+ // would return false, even if the host Fortran descriptor was, and we
+ // should have done an update of the device descriptor. e.g.
+ //
+ // !$omp target enter data map(x(1:100)) ! (1)
+ // p => x(10: 19)
+ // !$omp target enter data map(p, p(:)) ! (2)
+ // p => x(5: 9)
+ // !$omp target enter data map(attach(always): p(:)) ! (3)
+ //
+ // While PtrAddr(&desc_p) and PteeBase(&p(1)) are same for (2) and (3), the
+ // pointer attachment for (3) needs to update the bounds information
+ // in the descriptor of p on device.
+ if (!PtrTPR.getEntry()->addShadowPointer(
+ ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase}))
+ return OFFLOAD_SUCCESS;
+
+ DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr),
+ DPxPTR(TgtPteeBase));
+
+ // Lambda to handle submitData result and perform final steps.
+ auto HandleSubmitResult = [&](int SubmitResult) -> int {
+ if (SubmitResult != OFFLOAD_SUCCESS) {
+ REPORT("Failed to update pointer on device.\n");
+ return OFFLOAD_FAIL;
+ }
+
+ if (PtrTPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
+ OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ return OFFLOAD_SUCCESS;
+ };
+
+ bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize;
+ if (!IsPtrAFortranDescriptor) {
+ // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as
+ // the buffer space for the submission.
+ void *&BufferElement = AsyncInfo.getVoidPtrLocation();
+ BufferElement = TgtPteeBase;
+
+ // Submit the updated pointer value to device
+ return HandleSubmitResult(Device.submitData(
+ TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry()));
+ }
+
+ // For larger "pointers" (like Fortran's descriptors), we create a dynamic
+ // buffer, which will be eventually destroyed by AsyncInfo's post-processing
+ // callback.
+ char *DataBuffer = new char[HstPtrSize];
+
+ // For such descriptors, to the first VoidPtrSize bytes, we store the
+ // pointee's device address.
+ std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *));
+
+ // And to the remaining bytes, we copy the remaining contents of the host
+ // descriptor after the initial VoidPtrSize bytes.
+ uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+ void *HstDescriptorFieldsAddr = (char *)HstPtrAddr + VoidPtrSize;
+ std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
+ HstDescriptorFieldsSize);
+
+ DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64
+ " additional bytes from host descriptor " DPxMOD ")\n",
+ HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
+ DPxPTR(HstDescriptorFieldsAddr));
+
+ // Submit the entire buffer to device
+ // FIXME: When handling ATTACH map-type, pointer attachment needs to happen
+ // after the other mapping operations are done, to avoid possibility of
+ // pending transfers clobbering the attachment, for example:
+ //
+ // int *p = ...;
+ // int **pp = &p;
+ // map(to: pp[0], p[0])
+ //
+ // Which would be represented by:
+ // &pp[0], &pp[0], sizeof(pp[0]), TO (1)
+ // &p[0], &p[0], sizeof(p[0]), TO (2)
+ //
+ // &pp, &pp[0], sizeof(pp), ATTACH (3)
+ // &p, &p[0], sizeof(p), ATTACH (4)
+ //
+ // (4) and (1) are both trying to modify the device memory corresponding to
+ // &p. We need to ensure that (4) happens last.
+ //
+ // One possible solution to this could be to insert a "device barrier" before
+ // the first ATTACH submitData call, so that every subsequent submitData waits
+ // for any prior operations to finish. Like:
+ // Device.submitData(..., /*InOrder=*/IsFirstAttachEntry)
+ // Where the boolean InOrder being true means that this submission should
+ // wait for prior memory submissions to finish.
+ int SubmitResult =
+ Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, AsyncInfo,
+ PtrTPR.getEntry());
+
+ AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int {
+ delete[] DataBuffer;
+ return OFFLOAD_SUCCESS;
+ });
+ return HandleSubmitResult(SubmitResult);
+}
+
/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
- bool FromMapper) {
+ AttachInfoTy *AttachInfo, bool FromMapper) {
+ assert(AttachInfo && "AttachInfo must be available for targetDataBegin for "
+ "handling ATTACH map-types.");
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
@@ -352,7 +532,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
- targetDataBegin);
+ targetDataBegin, AttachInfo);
if (Rc != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@@ -369,6 +549,18 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t DataSize = ArgSizes[I];
map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
+ // ATTACH map-types are supposed to be handled after all mapping for the
+ // construct is done. Defer their processing.
+ if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+ AttachInfo->AttachEntries.emplace_back(
+ /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
+ /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
+ /*PointeeName=*/HstPtrName);
+
+ DP("Deferring ATTACH map-type processing for argument %d\n", I);
+ continue;
+ }
+
// Adjust for proper alignment if this is a combined entry (for structs).
// Look at the next argument - if that is MEMBER_OF this one, then this one
// is a combined entry.
@@ -434,6 +626,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
: "device failure or illegal mapping");
return OFFLOAD_FAIL;
}
+
+ // Track new allocation, for eventual use in attachment decision-making.
+ if (PointerTpr.Flags.IsNewEntry && !IsHostPtr)
+ AttachInfo->NewAllocations[HstPtrBase] = sizeof(void *);
+
DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
"\n",
sizeof(void *), DPxPTR(PointerTgtPtrBegin),
@@ -464,6 +661,11 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
: "device failure or illegal mapping");
return OFFLOAD_FAIL;
}
+
+ // Track new allocation, for eventual use in attachment decision-making.
+ if (TPR.Flags.IsNewEntry && !IsHostPtr && TgtPtrBegin)
+ AttachInfo->NewAllocations[HstPtrBegin] = DataSize;
+
DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
" - is%s new\n",
DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
@@ -476,30 +678,12 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
}
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
-
- uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
- void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
-
- if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
- (void **)PointerHstPtrBegin, HstPtrBase,
- (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
- DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
- DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
-
- void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
- TgtPtrBase = ExpectedTgtPtrBase;
-
- int Ret =
- Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
- AsyncInfo, PointerTpr.getEntry());
- if (Ret != OFFLOAD_SUCCESS) {
- REPORT("Copying data to device failed.\n");
- return OFFLOAD_FAIL;
- }
- if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
- OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
- }
+ int Ret = performPointerAttachment(Device, AsyncInfo,
+ (void **)PointerHstPtrBegin, HstPtrBase, HstPtrBegin,
+ (void **)PointerTgtPtrBegin, TgtPtrBegin,
+ sizeof(void *), PointerTpr);
+ if (Ret != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
}
// Check if variable can be used on the device:
@@ -515,6 +699,145 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
return OFFLOAD_SUCCESS;
}
+/// Process deferred ATTACH map entries collected during targetDataBegin.
+///
+/// From OpenMP's perspective, when mapping something that has a base pointer,
+/// such as:
+/// \code
+/// int *p;
+/// #pragma omp enter target data map(to: p[10:20])
+/// \endcode
+///
+/// a pointer-attachment between p and &p[10] should occur if both p and
+/// p[10] are present on the device after doing all allocations for all maps
+/// on the construct, and one of the following is true:
+///
+/// * The pointer p was newly allocated while handling the construct
+/// * The pointee p[10:20] was newly allocated while handling the construct
+/// * attach(always) map-type modifier was specified (OpenMP 6.1)
+///
+/// That's why we collect all attach entries and new memory allocations during
+/// targetDataBegin, and use that information to make the decision of whether
+/// to perform a pointer-attachment or not here, after maps have been handled.
+int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
+ AsyncInfoTy &AsyncInfo) {
+ // Report all tracked allocations from both main loop and ATTACH processing
+ if (!AttachInfo.NewAllocations.empty()) {
+ DP("Tracked %u total new allocations:\n",
+ (unsigned)AttachInfo.NewAllocations.size());
+ for (const auto &Alloc : AttachInfo.NewAllocations) {
+ DP(" Host ptr: " DPxMOD ", Size: %" PRId64 " bytes\n",
+ DPxPTR(Alloc.first), Alloc.second);
+ }
+ }
+
+ if (AttachInfo.AttachEntries.empty())
+ return OFFLOAD_SUCCESS;
+
+ DP("Processing %zu deferred ATTACH map entries\n",
+ AttachInfo.AttachEntries.size());
+
+ for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size();
+ ++EntryIdx) {
+ const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx];
+
+ void **HstPtr = (void **)AttachEntry.PointerBase;
+
+ void *HstPteeBase = *HstPtr;
+ void *HstPteeBegin = AttachEntry.PointeeBegin;
+
+ int64_t PtrSize = AttachEntry.PointerSize;
+ int64_t MapType = AttachEntry.MapType;
+
+ DP("Processing ATTACH entry %zu: HstPtr=" DPxMOD ", HstPteeBegin=" DPxMOD
+ ", Size=%" PRId64 ", Type=0x%" PRIx64 "\n",
+ EntryIdx, DPxPTR(HstPtr), DPxPTR(HstPteeBegin), PtrSize, MapType);
+
+ const bool IsAttachAlways = MapType & OMP_TGT_MAPTYPE_ALWAYS;
+
+ // Lambda to check if a pointer was newly allocated
+ auto WasNewlyAllocated = [&](void *Ptr, const char *PtrName) {
+ bool IsNewlyAllocated =
+ llvm::any_of(AttachInfo.NewAllocations, [&](const auto &Alloc) {
+ void *AllocPtr = Alloc.first;
+ int64_t AllocSize = Alloc.second;
+ return Ptr >= AllocPtr &&
+ Ptr < (void *)((char *)AllocPtr + AllocSize);
+ });
+ DP("ATTACH entry %zu: %s pointer " DPxMOD " was newly allocated: %s\n",
+ EntryIdx, PtrName, DPxPTR(Ptr), IsNewlyAllocated ? "yes" : "no");
+ return IsNewlyAllocated;
+ };
+
+ // Only process ATTACH if base/begin was newly allocated OR ALWAYS flag is
+ // set
+ if (!IsAttachAlways && !WasNewlyAllocated(HstPtr, "pointer") &&
+ !WasNewlyAllocated(HstPteeBegin, "pointee")) {
+ DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly "
+ "allocated and no ALWAYS flag\n",
+ EntryIdx);
+ continue;
+ }
+
+ DP("Processing ATTACH entry %zu: Always=%s\n", EntryIdx,
+ IsAttachAlways ? "yes" : "no");
+
+ // Lambda to perform target pointer lookup and validation
+ auto LookupTargetPointer = [&](void *Ptr, int64_t Size, const char *PtrType)
+ -> std::optional<TargetPointerResultTy> {
+ // ATTACH map-type does not change ref-count, or do any allocation
+ // We just need to do a lookup for the pointer/pointee.
+ TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
+ Ptr, Size, /*UpdateRefCount=*/false,
+ /*UseHoldRefCount=*/false, /*MustContain=*/true);
+
+ DP("ATTACH entry %zu: %s lookup - HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
+ ", IsPresent=%s, IsHostPtr=%s\n",
+ EntryIdx, PtrType, DPxPTR(Ptr), DPxPTR(TPR.TargetPointer),
+ TPR.isPresent() ? "yes" : "no",
+ TPR.Flags.IsHostPointer ? "yes" : "no");
+
+ if (!TPR.isPresent()) {
+ DP("Skipping ATTACH entry %zu: %s not present on device\n", EntryIdx,
+ PtrType);
+ return std::nullopt;
+ }
+ if (TPR.Flags.IsHostPointer) {
+ DP("Skipping ATTACH entry %zu: device version of the %s is a host "
+ "pointer.\n",
+ EntryIdx, PtrType);
+ return std::nullopt;
+ }
+
+ return TPR;
+ };
+
+ // Get device version of the pointer (e.g., &p)
+ auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer");
+ if (!PtrTPROpt)
+ continue;
+ TargetPointerResultTy &PtrTPR = *PtrTPROpt;
+ void **TgtPtrBase = (void **)PtrTPR.TargetPointer;
+
+ // Get device version of the pointee (e.g., &p[10])
+ auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee");
+ if (!PteeTPROpt)
+ continue;
+ void *TgtPteeBegin = PteeTPROpt->TargetPointer;
+
+ // Update the device pointer to point to device pointee.
+ int Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase,
+ HstPteeBegin, TgtPtrBase, TgtPteeBegin,
+ PtrSize, PtrTPR);
+ if (Ret != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ DP("ATTACH entry %zu processed successfully\n", EntryIdx);
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
namespace {
/// This structure contains information to deallocate a target pointer, aka.
/// used to fix up the shadow map and potentially delete the entry from the
@@ -624,7 +947,8 @@ postProcessingTargetDataEnd(DeviceTy *Device,
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
- void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+ void **ArgMappers, AsyncInfoTy &AsyncInfo,
+ AttachInfoTy *AttachInfo, bool FromMapper) {
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
// process each input.
@@ -635,6 +959,14 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
+ // Ignore ATTACH entries - they should only be honored on map-entering
+ // directives. They may be encountered here while handling the "end" part of
+ // "#pragma omp target".
+ if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
+ DP("Ignoring ATTACH entry %d in targetDataEnd\n", I);
+ continue;
+ }
+
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataEnd, call the
// targetDataMapper variant which will call targetDataEnd again
@@ -900,7 +1232,8 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
- void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
+ void **ArgMappers, AsyncInfoTy &AsyncInfo,
+ AttachInfoTy *AttachInfo, bool FromMapper) {
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -1213,13 +1546,27 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+ // Create AttachInfo for tracking any ATTACH entries, or new-allocations
+ // when handling the "begin" mapping for a target constructs.
+ AttachInfoTy AttachInfo;
+
int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
- ArgTypes, ArgNames, ArgMappers, AsyncInfo);
+ ArgTypes, ArgNames, ArgMappers, AsyncInfo,
+ &AttachInfo, false /*FromMapper=*/);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin failed, abort target.\n");
return OFFLOAD_FAIL;
}
+ // Process collected ATTACH entries
+ if (!AttachInfo.AttachEntries.empty()) {
+ Ret = processAttachEntries(*DeviceOrErr, AttachInfo, AsyncInfo);
+ if (Ret != OFFLOAD_SUCCESS) {
+ REPORT("Failed to process ATTACH entries.\n");
+ return OFFLOAD_FAIL;
+ }
+ }
+
// List of (first-)private arrays allocated for this target region
SmallVector<int> TgtArgsPositions(ArgNum, -1);
>From b645d581e7add334bcdb11b5c845582c696d47cc Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Wed, 16 Jul 2025 01:47:34 -0700
Subject: [PATCH 2/9] Minor format/stylistic changes.
---
offload/libomptarget/interface.cpp | 7 +++----
offload/libomptarget/omptarget.cpp | 18 +++++++++---------
2 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 16e46e6d23872..1a65262f9dcda 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -168,13 +168,12 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
// Only allocate AttachInfo for targetDataBegin
AttachInfoTy *AttachInfo = nullptr;
- if (TargetDataFunction == targetDataBegin) {
+ if (TargetDataFunction == targetDataBegin)
AttachInfo = new AttachInfoTy();
- }
Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
- ArgTypes, ArgNames, ArgMappers, AsyncInfo,
- AttachInfo, false /*FromMapper=*/);
+ ArgTypes, ArgNames, ArgMappers, AsyncInfo, AttachInfo,
+ /*FromMapper=*/false);
if (Rc == OFFLOAD_SUCCESS) {
// Process deferred ATTACH entries BEFORE synchronization
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index aa142814e8384..b5bbc5a409e85 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -496,9 +496,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
// Device.submitData(..., /*InOrder=*/IsFirstAttachEntry)
// Where the boolean InOrder being true means that this submission should
// wait for prior memory submissions to finish.
- int SubmitResult =
- Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, AsyncInfo,
- PtrTPR.getEntry());
+ int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize,
+ AsyncInfo, PtrTPR.getEntry());
AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int {
delete[] DataBuffer;
@@ -678,10 +677,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
}
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
- int Ret = performPointerAttachment(Device, AsyncInfo,
- (void **)PointerHstPtrBegin, HstPtrBase, HstPtrBegin,
- (void **)PointerTgtPtrBegin, TgtPtrBegin,
- sizeof(void *), PointerTpr);
+ int Ret = performPointerAttachment(
+ Device, AsyncInfo, (void **)PointerHstPtrBegin, HstPtrBase,
+ HstPtrBegin, (void **)PointerTgtPtrBegin, TgtPtrBegin, sizeof(void *),
+ PointerTpr);
if (Ret != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
@@ -783,8 +782,9 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
IsAttachAlways ? "yes" : "no");
// Lambda to perform target pointer lookup and validation
- auto LookupTargetPointer = [&](void *Ptr, int64_t Size, const char *PtrType)
- -> std::optional<TargetPointerResultTy> {
+ auto LookupTargetPointer =
+ [&](void *Ptr, int64_t Size,
+ const char *PtrType) -> std::optional<TargetPointerResultTy> {
// ATTACH map-type does not change ref-count, or do any allocation
// We just need to do a lookup for the pointer/pointee.
TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
>From 168a2b89a67ec161eb2adac1abbf671c3037a5da Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Fri, 25 Jul 2025 08:02:53 +0200
Subject: [PATCH 3/9] add dataFence to plugin interface
---
offload/include/device.h | 5 +++++
offload/libomptarget/device.cpp | 4 ++++
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++++
.../plugins-nextgen/common/include/PluginInterface.h | 8 ++++++++
.../plugins-nextgen/common/src/PluginInterface.cpp | 11 +++++++++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++++
offload/plugins-nextgen/host/src/rtl.cpp | 4 ++++
7 files changed, 40 insertions(+)
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..226a9c8902354 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -98,6 +98,11 @@ struct DeviceTy {
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo);
+ // Insert a data fence between previous data operations and the following
+ // operations if necessary for the device.
+ int32_t dataFence(AsyncInfoTy &AsyncInfo);
+
+
/// Notify the plugin about a new mapping starting at the host address
/// \p HstPtr and \p Size bytes.
int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..6585286bf4285 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -191,6 +191,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
DstPtr, Size, AsyncInfo);
}
+int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) {
+ return RTL->data_fence(RTLDeviceID, AsyncInfo);
+}
+
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
DPxPTR(HstPtr), Size);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f8db9bf0ae739..d436fa8cc685b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2538,6 +2538,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}
+ Error dataFence(__tgt_async_info *Async) override {
+ return Plugin::success();
+ }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c17a2ee07047..e4ea79542609d 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -891,6 +891,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ /// Instert a data fence between previous data operations and the following
+ /// operations if necessary for the device
+ virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
+
/// Exchange data between devices (device to device transfer). Calling this
/// function is only valid if GenericPlugin::isDataExchangable() passing the
/// two devices returns true.
@@ -1355,6 +1359,10 @@ struct GenericPluginTy {
int DstDeviceId, void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
+ /// Places a fence between previous data movements and following data movements
+ /// if necessary on the device
+ int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo);
+
/// Begin executing a kernel on the given device.
int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94a050b559efe..5425e1ec3e21e 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2231,3 +2231,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
*KernelPtr = &Kernel;
return OFFLOAD_SUCCESS;
}
+
+int32_t GenericPluginTy::data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo ) {
+ auto Err = getDevice(DeviceId).dataFence(AsyncInfo);
+ if (Err) {
+ REPORT("Failure to place data fence on device %d: %s\n",
+ DeviceId, toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
\ No newline at end of file
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 5a391a4d36006..e396ee8117d0a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -858,6 +858,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error dataFence(__tgt_async_info *Async) override {
+ return Plugin::success();
+ }
+
/// Initialize the device info for interoperability purposes.
Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
assert(Context && "Context is null");
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572265b4c..58b1e69206722 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -295,6 +295,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
"dataExchangeImpl not supported");
}
+ Error dataFence(__tgt_async_info *Async) override {
+ return Plugin::success();
+ }
+
/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
>From 23702158609886016e01a4f3a1f3edb94d085660 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Sun, 27 Jul 2025 03:26:11 -0700
Subject: [PATCH 4/9] Clang-format fixes.
---
offload/include/device.h | 1 -
offload/plugins-nextgen/common/include/PluginInterface.h | 6 +++---
offload/plugins-nextgen/common/src/PluginInterface.cpp | 7 ++++---
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/offload/include/device.h b/offload/include/device.h
index 226a9c8902354..1e85bb1876c83 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -102,7 +102,6 @@ struct DeviceTy {
// operations if necessary for the device.
int32_t dataFence(AsyncInfoTy &AsyncInfo);
-
/// Notify the plugin about a new mapping starting at the host address
/// \p HstPtr and \p Size bytes.
int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index e4ea79542609d..c9f40d2994a0b 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -891,7 +891,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
- /// Instert a data fence between previous data operations and the following
+ /// Instert a data fence between previous data operations and the following
/// operations if necessary for the device
virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -1359,8 +1359,8 @@ struct GenericPluginTy {
int DstDeviceId, void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
- /// Places a fence between previous data movements and following data movements
- /// if necessary on the device
+ /// Places a fence between previous data movements and following data
+ /// movements if necessary on the device
int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo);
/// Begin executing a kernel on the given device.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5425e1ec3e21e..ae4433b05571b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2232,11 +2232,12 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
return OFFLOAD_SUCCESS;
}
-int32_t GenericPluginTy::data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo ) {
+int32_t GenericPluginTy::data_fence(int32_t DeviceId,
+ __tgt_async_info *AsyncInfo) {
auto Err = getDevice(DeviceId).dataFence(AsyncInfo);
if (Err) {
- REPORT("Failure to place data fence on device %d: %s\n",
- DeviceId, toString(std::move(Err)).data());
+ REPORT("Failure to place data fence on device %d: %s\n", DeviceId,
+ toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
>From 280638949165611c293e23792c2481a8df7a45bc Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Sun, 27 Jul 2025 04:37:36 -0700
Subject: [PATCH 5/9] Insert a data-fence before the first pointer-attachment.
---
offload/libomptarget/omptarget.cpp | 102 +++++++++++++++++------------
1 file changed, 60 insertions(+), 42 deletions(-)
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index b5bbc5a409e85..eebfa340c8472 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -333,23 +333,23 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
/// Utility function to perform a pointer attachment operation.
///
/// For something like:
-/// \code
+/// ```cpp
/// int *p;
/// ...
/// #pragma omp target enter data map(to:p[10:10])
-/// \endcode
+/// ```
///
/// for which the attachment operation gets represented using:
-/// \code
+/// ```
/// &p, &p[10], sizeof(p), ATTACH
-/// \endcode
+/// ```
///
/// (Hst|Tgt)PtrAddr represents &p
/// (Hst|Tgt)PteeBase represents &p[0]
/// (Hst|Tgt)PteeBegin represents &p[10]
///
/// This function first computes the expected TgtPteeBase using:
-/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
+/// `<Select>TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)`
///
/// and then attaches TgtPteeBase to TgtPtrAddr.
///
@@ -362,14 +362,14 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
/// information such as lower-bound/upper-bound etc in their subsequent fields.
///
/// For example, for the following:
-/// \code
+/// ```fortran
/// integer, allocatable :: x(:)
/// integer, pointer :: p(:)
/// ...
/// p => x(10: 19)
/// ...
/// !$omp target enter data map(to:p(:))
-/// \endcode
+/// ```
///
/// The map should trigger a pointer-attachment (assuming the pointer-attachment
/// conditions as noted on processAttachEntries are met) between the descriptor
@@ -384,10 +384,10 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
///
/// The function also handles pointer-attachment portion of PTR_AND_OBJ maps,
/// like:
-/// \code
+/// ```
/// &p, &p[10], 10 * sizeof(p[10]), PTR_AND_OBJ
-/// \endcoe
-/// by using "sizeof(void*)" as \p HstPtrSize.
+/// ```
+/// by using `sizeof(void*)` as \p HstPtrSize.
static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
void **HstPtrAddr, void *HstPteeBase,
void *HstPteeBegin, void **TgtPtrAddr,
@@ -405,8 +405,9 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
// Add shadow pointer tracking
// TODO: Support shadow-tracking of larger than VoidPtrSize pointers,
// to support restoration of Fortran descriptors. Currently, this check
- // would return false, even if the host Fortran descriptor was, and we
- // should have done an update of the device descriptor. e.g.
+ // would return false, even if the host Fortran descriptor had been
+ // updated since its previous map, and we should have updated its
+ // device counterpart. e.g.
//
// !$omp target enter data map(x(1:100)) ! (1)
// p => x(10: 19)
@@ -472,30 +473,6 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
DPxPTR(HstDescriptorFieldsAddr));
// Submit the entire buffer to device
- // FIXME: When handling ATTACH map-type, pointer attachment needs to happen
- // after the other mapping operations are done, to avoid possibility of
- // pending transfers clobbering the attachment, for example:
- //
- // int *p = ...;
- // int **pp = &p;
- // map(to: pp[0], p[0])
- //
- // Which would be represented by:
- // &pp[0], &pp[0], sizeof(pp[0]), TO (1)
- // &p[0], &p[0], sizeof(p[0]), TO (2)
- //
- // &pp, &pp[0], sizeof(pp), ATTACH (3)
- // &p, &p[0], sizeof(p), ATTACH (4)
- //
- // (4) and (1) are both trying to modify the device memory corresponding to
- // &p. We need to ensure that (4) happens last.
- //
- // One possible solution to this could be to insert a "device barrier" before
- // the first ATTACH submitData call, so that every subsequent submitData waits
- // for any prior operations to finish. Like:
- // Device.submitData(..., /*InOrder=*/IsFirstAttachEntry)
- // Where the boolean InOrder being true means that this submission should
- // wait for prior memory submissions to finish.
int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize,
AsyncInfo, PtrTPR.getEntry());
@@ -702,10 +679,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
///
/// From OpenMP's perspective, when mapping something that has a base pointer,
/// such as:
-/// \code
+/// ```cpp
/// int *p;
/// #pragma omp enter target data map(to: p[10:20])
-/// \endcode
+/// ```
///
/// a pointer-attachment between p and &p[10] should occur if both p and
/// p[10] are present on the device after doing all allocations for all maps
@@ -718,6 +695,33 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
/// That's why we collect all attach entries and new memory allocations during
/// targetDataBegin, and use that information to make the decision of whether
/// to perform a pointer-attachment or not here, after maps have been handled.
+///
+/// Additionally, once we decide that a pointer-attachment should be performed,
+/// we need to make sure that it happens after any previously submitted data
+/// transfers have completed, to avoid the possibility of the pending transfers
+/// clobbering the attachment. For example:
+///
+/// ```cpp
+/// int *p = ...;
+/// int **pp = &p;
+/// map(to: pp[0], p[0])
+/// ```
+///
+/// Which would be represented by:
+/// ```
+/// &pp[0], &pp[0], sizeof(pp[0]), TO (1)
+/// &p[0], &p[0], sizeof(p[0]), TO (2)
+///
+/// &pp, &pp[0], sizeof(pp), ATTACH (3)
+/// &p, &p[0], sizeof(p), ATTACH (4)
+/// ```
+///
+/// (4) and (1) are both trying to modify the device memory corresponding to
+/// `&p`. So, if we decide that (4) should do an attachment, we also need to
+/// ensure that (4) happens after (1) is complete.
+///
+/// For this purpose, we insert a data_fence before the first
+/// pointer-attachment, (3), to ensure that all pending transfers finish first.
int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
AsyncInfoTy &AsyncInfo) {
// Report all tracked allocations from both main loop and ATTACH processing
@@ -736,6 +740,8 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
DP("Processing %zu deferred ATTACH map entries\n",
AttachInfo.AttachEntries.size());
+ int Ret = OFFLOAD_SUCCESS;
+ bool IsFirstPointerAttachment = true;
for (size_t EntryIdx = 0; EntryIdx < AttachInfo.AttachEntries.size();
++EntryIdx) {
const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx];
@@ -825,10 +831,22 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
continue;
void *TgtPteeBegin = PteeTPROpt->TargetPointer;
- // Update the device pointer to point to device pointee.
- int Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase,
- HstPteeBegin, TgtPtrBase, TgtPteeBegin,
- PtrSize, PtrTPR);
+ // Insert a data-fence before the first pointer-attachment.
+ if (IsFirstPointerAttachment) {
+ IsFirstPointerAttachment = false;
+ DP("Inserting a data fence before the first pointer attachment.\n");
+ Ret = Device.dataFence(AsyncInfo);
+ if (Ret != OFFLOAD_SUCCESS) {
+ REPORT("Failed to insert data fence.\n");
+ return OFFLOAD_FAIL;
+ }
+ }
+
+ // Do the pointer-attachment, i.e. update the device pointer to point to
+ // device pointee.
+ Ret = performPointerAttachment(Device, AsyncInfo, HstPtr, HstPteeBase,
+ HstPteeBegin, TgtPtrBase, TgtPteeBegin,
+ PtrSize, PtrTPR);
if (Ret != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
>From 40320d2adca0e293bc64fd66eb70e94515594920 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Wed, 30 Jul 2025 11:34:28 -0700
Subject: [PATCH 6/9] Use unique_ptr, avoid c-style casts.
---
offload/libomptarget/interface.cpp | 12 +++++-------
offload/libomptarget/omptarget.cpp | 25 +++++++++++++++----------
2 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 1a65262f9dcda..661b14dd80f8e 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -30,6 +30,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
+#include <memory>
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
@@ -167,13 +168,13 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
int Rc = OFFLOAD_SUCCESS;
// Only allocate AttachInfo for targetDataBegin
- AttachInfoTy *AttachInfo = nullptr;
+ std::unique_ptr<AttachInfoTy> AttachInfo;
if (TargetDataFunction == targetDataBegin)
- AttachInfo = new AttachInfoTy();
+ AttachInfo = std::make_unique<AttachInfoTy>();
Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
- ArgTypes, ArgNames, ArgMappers, AsyncInfo, AttachInfo,
- /*FromMapper=*/false);
+ ArgTypes, ArgNames, ArgMappers, AsyncInfo,
+ AttachInfo.get(), /*FromMapper=*/false);
if (Rc == OFFLOAD_SUCCESS) {
// Process deferred ATTACH entries BEFORE synchronization
@@ -184,9 +185,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
Rc = AsyncInfo.synchronize();
}
- if (AttachInfo)
- delete AttachInfo;
-
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
}
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index eebfa340c8472..eec34a4f9eb3f 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -399,8 +399,10 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
int64_t VoidPtrSize = sizeof(void *);
assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
- uint64_t Delta = (uint64_t)HstPteeBegin - (uint64_t)HstPteeBase;
- void *TgtPteeBase = (void *)((uint64_t)TgtPteeBegin - Delta);
+ uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) -
+ reinterpret_cast<uint64_t>(HstPteeBase);
+ void *TgtPteeBase = reinterpret_cast<void *>(
+ reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
// Add shadow pointer tracking
// TODO: Support shadow-tracking of larger than VoidPtrSize pointers,
@@ -463,7 +465,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
// And to the remaining bytes, we copy the remaining contents of the host
// descriptor after the initial VoidPtrSize bytes.
uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
- void *HstDescriptorFieldsAddr = (char *)HstPtrAddr + VoidPtrSize;
+ void *HstDescriptorFieldsAddr =
+ reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
HstDescriptorFieldsSize);
@@ -613,7 +616,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
(PointerTpr.Flags.IsNewEntry ? "" : " not"));
PointerHstPtrBegin = HstPtrBase;
// modify current entry.
- HstPtrBase = *(void **)HstPtrBase;
+ HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
// No need to update pointee ref count for the first element of the
// subelement that comes from mapper.
UpdateRef =
@@ -655,9 +658,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
int Ret = performPointerAttachment(
- Device, AsyncInfo, (void **)PointerHstPtrBegin, HstPtrBase,
- HstPtrBegin, (void **)PointerTgtPtrBegin, TgtPtrBegin, sizeof(void *),
- PointerTpr);
+ Device, AsyncInfo, reinterpret_cast<void **>(PointerHstPtrBegin),
+ HstPtrBase, HstPtrBegin,
+ reinterpret_cast<void **>(PointerTgtPtrBegin), TgtPtrBegin,
+ sizeof(void *), PointerTpr);
if (Ret != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
@@ -746,7 +750,7 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
++EntryIdx) {
const auto &AttachEntry = AttachInfo.AttachEntries[EntryIdx];
- void **HstPtr = (void **)AttachEntry.PointerBase;
+ void **HstPtr = reinterpret_cast<void **>(AttachEntry.PointerBase);
void *HstPteeBase = *HstPtr;
void *HstPteeBegin = AttachEntry.PointeeBegin;
@@ -767,7 +771,8 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
void *AllocPtr = Alloc.first;
int64_t AllocSize = Alloc.second;
return Ptr >= AllocPtr &&
- Ptr < (void *)((char *)AllocPtr + AllocSize);
+ Ptr < reinterpret_cast<void *>(
+ reinterpret_cast<char *>(AllocPtr) + AllocSize);
});
DP("ATTACH entry %zu: %s pointer " DPxMOD " was newly allocated: %s\n",
EntryIdx, PtrName, DPxPTR(Ptr), IsNewlyAllocated ? "yes" : "no");
@@ -823,7 +828,7 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
if (!PtrTPROpt)
continue;
TargetPointerResultTy &PtrTPR = *PtrTPROpt;
- void **TgtPtrBase = (void **)PtrTPR.TargetPointer;
+ void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer);
// Get device version of the pointee (e.g., &p[10])
auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee");
>From 39e4ab1c667e09e85d3eb6906ce4ec2fb1a9b6d1 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Fri, 1 Aug 2025 04:57:00 -0700
Subject: [PATCH 7/9] Add newline at end of file.
---
offload/plugins-nextgen/common/src/PluginInterface.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index ae4433b05571b..1b034d26df28f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2242,4 +2242,4 @@ int32_t GenericPluginTy::data_fence(int32_t DeviceId,
}
return OFFLOAD_SUCCESS;
-}
\ No newline at end of file
+}
>From eb35a3ef9962d3773e513daf8179fe5d8ffdfdb9 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Tue, 5 Aug 2025 01:14:53 -0700
Subject: [PATCH 8/9] Clean-up some debug prints.
---
offload/libomptarget/omptarget.cpp | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index eec34a4f9eb3f..4dbb45e686a85 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -421,8 +421,11 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
// pointer attachment for (3) needs to update the bounds information
// in the descriptor of p on device.
if (!PtrTPR.getEntry()->addShadowPointer(
- ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase}))
+ ShadowPtrInfoTy{HstPtrAddr, HstPteeBase, TgtPtrAddr, TgtPteeBase})) {
+ DP("Pointer " DPxMOD " is already attached to " DPxMOD "\n",
+ DPxPTR(TgtPtrAddr), DPxPTR(TgtPteeBase));
return OFFLOAD_SUCCESS;
+ }
DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(TgtPtrAddr),
DPxPTR(TgtPteeBase));
@@ -774,8 +777,8 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
Ptr < reinterpret_cast<void *>(
reinterpret_cast<char *>(AllocPtr) + AllocSize);
});
- DP("ATTACH entry %zu: %s pointer " DPxMOD " was newly allocated: %s\n",
- EntryIdx, PtrName, DPxPTR(Ptr), IsNewlyAllocated ? "yes" : "no");
+ DP("Attach %s " DPxMOD " was newly allocated: %s\n", PtrName, DPxPTR(Ptr),
+ IsNewlyAllocated ? "yes" : "no");
return IsNewlyAllocated;
};
@@ -789,9 +792,6 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
continue;
}
- DP("Processing ATTACH entry %zu: Always=%s\n", EntryIdx,
- IsAttachAlways ? "yes" : "no");
-
// Lambda to perform target pointer lookup and validation
auto LookupTargetPointer =
[&](void *Ptr, int64_t Size,
@@ -802,9 +802,7 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
Ptr, Size, /*UpdateRefCount=*/false,
/*UseHoldRefCount=*/false, /*MustContain=*/true);
- DP("ATTACH entry %zu: %s lookup - HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
- ", IsPresent=%s, IsHostPtr=%s\n",
- EntryIdx, PtrType, DPxPTR(Ptr), DPxPTR(TPR.TargetPointer),
+ DP("Attach %s lookup - IsPresent=%s, IsHostPtr=%s\n", PtrType,
TPR.isPresent() ? "yes" : "no",
TPR.Flags.IsHostPointer ? "yes" : "no");
>From 4517cd2bcd140a870c3846a52750ed8a1874bd3d Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Fri, 8 Aug 2025 04:31:49 -0700
Subject: [PATCH 9/9] Do pointee lookup and release its TPR before pointer
lookup.
---
offload/libomptarget/omptarget.cpp | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 4dbb45e686a85..4b2200b6ebaa5 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -782,10 +782,10 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
return IsNewlyAllocated;
};
- // Only process ATTACH if base/begin was newly allocated OR ALWAYS flag is
- // set
- if (!IsAttachAlways && !WasNewlyAllocated(HstPtr, "pointer") &&
- !WasNewlyAllocated(HstPteeBegin, "pointee")) {
+ // Only process ATTACH if either the pointee or the pointer was newly
+ // allocated, or the ALWAYS flag is set.
+ if (!IsAttachAlways && !WasNewlyAllocated(HstPteeBegin, "pointee") &&
+ !WasNewlyAllocated(HstPtr, "pointer")) {
DP("Skipping ATTACH entry %zu: neither pointer nor pointee was newly "
"allocated and no ALWAYS flag\n",
EntryIdx);
@@ -821,19 +821,22 @@ int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo,
return TPR;
};
- // Get device version of the pointer (e.g., &p)
+ // Get device version of the pointee (e.g., &p[10]) first, as we can
+ // release its TPR after extracting the pointer value.
+ void *TgtPteeBegin;
+ if (auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee"))
+ TgtPteeBegin = PteeTPROpt->TargetPointer;
+ else
+ continue;
+
+ // Get device version of the pointer (e.g., &p) next. We need to keep its
+ // TPR for use in shadow-pointer handling during pointer-attachment.
auto PtrTPROpt = LookupTargetPointer(HstPtr, PtrSize, "pointer");
if (!PtrTPROpt)
continue;
TargetPointerResultTy &PtrTPR = *PtrTPROpt;
void **TgtPtrBase = reinterpret_cast<void **>(PtrTPR.TargetPointer);
- // Get device version of the pointee (e.g., &p[10])
- auto PteeTPROpt = LookupTargetPointer(HstPteeBegin, 0, "pointee");
- if (!PteeTPROpt)
- continue;
- void *TgtPteeBegin = PteeTPROpt->TargetPointer;
-
// Insert a data-fence before the first pointer-attachment.
if (IsFirstPointerAttachment) {
IsFirstPointerAttachment = false;
More information about the llvm-commits
mailing list