[Openmp-commits] [openmp] 6847bce - [libomptarget] Add support for target update non-contiguous

via Openmp-commits openmp-commits at lists.llvm.org
Fri Nov 6 18:55:45 PST 2020


Author: cchen
Date: 2020-11-06T20:55:33-06:00
New Revision: 6847bcec1aa9e262e2b175926d94a12fc1174c6d

URL: https://github.com/llvm/llvm-project/commit/6847bcec1aa9e262e2b175926d94a12fc1174c6d
DIFF: https://github.com/llvm/llvm-project/commit/6847bcec1aa9e262e2b175926d94a12fc1174c6d.diff

LOG: [libomptarget] Add support for target update non-contiguous

This patch is the runtime support for https://reviews.llvm.org/D84192.

In order not to modify the tgt_target_data_update information but still be
able to pass the extra information for non-contiguous map item (offset,
count, and stride for each dimension), this patch overload arg when
the maptype is set as OMP_TGT_MAPTYPE_DESCRIPTOR. The origin arg is for
passing the pointer information, however, the overloaded arg is an
array of descriptor_dim:

```
struct descriptor_dim {
  int64_t offset;
  int64_t count;
  int64_t stride
};
```

and the array size is the dimension size. In addition, since we
have count and stride information in descriptor_dim, we can replace/overload the
arg_size parameter by using dimension size.

Reviewed By: grokos

Differential Revision: https://reviews.llvm.org/D82245

Added: 
    openmp/libomptarget/test/offloading/non_contiguous_update.cpp

Modified: 
    openmp/libomptarget/include/omptarget.h
    openmp/libomptarget/src/omptarget.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 9e7c28b14f8b..3f9761afd0aa 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -50,6 +50,8 @@ enum tgt_map_type {
   OMP_TGT_MAPTYPE_CLOSE           = 0x400,
   // runtime error if not already allocated
   OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
+  // descriptor for non-contiguous target-update
+  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
   // member of struct, member given by [16 MSBs] - 1
   OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
 };
@@ -121,6 +123,13 @@ struct __tgt_async_info {
   void *Queue = nullptr;
 };
 
+/// This struct is a record of non-contiguous information
+struct __tgt_target_non_contig {
+  uint64_t offset;
+  uint64_t count;
+  uint64_t stride;
+};
+
 #ifdef __cplusplus
 extern "C" {
 #endif

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 7548b81f9143..274ebae8633d 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -635,6 +635,133 @@ int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
   return OFFLOAD_SUCCESS;
 }
 
+static int target_data_contiguous(DeviceTy &Device, void *args_base,
+                                  void *HstPtrBegin, int64_t MapSize,
+                                  int64_t arg_type) {
+  bool IsLast, IsHostPtr;
+  void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, false,
+                                            IsHostPtr, /*MustContain=*/true);
+  if (!TgtPtrBegin) {
+    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
+    if (arg_type & OMP_TGT_MAPTYPE_PRESENT) {
+      MESSAGE("device mapping required by 'present' motion modifier does not "
+              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
+              DPxPTR(HstPtrBegin), MapSize);
+      return OFFLOAD_FAIL;
+    }
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+      TgtPtrBegin == HstPtrBegin) {
+    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
+       DPxPTR(HstPtrBegin));
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (arg_type & OMP_TGT_MAPTYPE_FROM) {
+    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+       arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+    int rt = Device.retrieveData(HstPtrBegin, TgtPtrBegin, MapSize, nullptr);
+    if (rt != OFFLOAD_SUCCESS) {
+      REPORT("Copying data from device failed.\n");
+      return OFFLOAD_FAIL;
+    }
+
+    uintptr_t lb = (uintptr_t)HstPtrBegin;
+    uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize;
+    Device.ShadowMtx.lock();
+    for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+         it != Device.ShadowPtrMap.end(); ++it) {
+      void **ShadowHstPtrAddr = (void **)it->first;
+      if ((uintptr_t)ShadowHstPtrAddr < lb)
+        continue;
+      if ((uintptr_t)ShadowHstPtrAddr >= ub)
+        break;
+      DP("Restoring original host pointer value " DPxMOD
+         " for host pointer " DPxMOD "\n",
+         DPxPTR(it->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+      *ShadowHstPtrAddr = it->second.HstPtrVal;
+    }
+    Device.ShadowMtx.unlock();
+  }
+
+  if (arg_type & OMP_TGT_MAPTYPE_TO) {
+    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+       arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+    int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, MapSize, nullptr);
+    if (rt != OFFLOAD_SUCCESS) {
+      REPORT("Copying data to device failed.\n");
+      return OFFLOAD_FAIL;
+    }
+
+    uintptr_t lb = (uintptr_t)HstPtrBegin;
+    uintptr_t ub = (uintptr_t)HstPtrBegin + MapSize;
+    Device.ShadowMtx.lock();
+    for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+         it != Device.ShadowPtrMap.end(); ++it) {
+      void **ShadowHstPtrAddr = (void **)it->first;
+      if ((uintptr_t)ShadowHstPtrAddr < lb)
+        continue;
+      if ((uintptr_t)ShadowHstPtrAddr >= ub)
+        break;
+      DP("Restoring original target pointer value " DPxMOD " for target "
+         "pointer " DPxMOD "\n",
+         DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr));
+      rt = Device.submitData(it->second.TgtPtrAddr, &it->second.TgtPtrVal,
+                             sizeof(void *), nullptr);
+      if (rt != OFFLOAD_SUCCESS) {
+        REPORT("Copying data to device failed.\n");
+        Device.ShadowMtx.unlock();
+        return OFFLOAD_FAIL;
+      }
+    }
+    Device.ShadowMtx.unlock();
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+static int target_data_non_contiguous(DeviceTy &Device, void *arg_base,
+                                      __tgt_target_non_contig *non_contig,
+                                      uint64_t size, int64_t arg_type,
+                                      int current_dim, int dim_size,
+                                      uint64_t offset) {
+  int rt = OFFLOAD_SUCCESS;
+  if (current_dim < dim_size) {
+    for (unsigned int i = 0; i < non_contig[current_dim].count; ++i) {
+      uint64_t cur_offset =
+          (non_contig[current_dim].offset + i) * non_contig[current_dim].stride;
+      // we only need to transfer the first element for the last dimension
+      // since we've already got a contiguous piece.
+      if (current_dim != dim_size - 1 || i == 0) {
+        rt = target_data_non_contiguous(Device, arg_base, non_contig, size,
+                                        arg_type, current_dim + 1, dim_size,
+                                        offset + cur_offset);
+        // Stop the whole process if any contiguous piece returns anything
+        // other than OFFLOAD_SUCCESS.
+        if (rt != OFFLOAD_SUCCESS)
+          return rt;
+      }
+    }
+  } else {
+    char *ptr = (char *)arg_base + offset;
+    DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n",
+       (uint64_t)ptr, offset, size);
+    rt = target_data_contiguous(Device, arg_base, ptr, size, arg_type);
+  }
+  return rt;
+}
+
+static int get_non_contig_merged_dimension(__tgt_target_non_contig *non_contig,
+                                           int32_t dim_size) {
+  int removed_dim = 0;
+  for (int i = dim_size - 1; i > 0; --i) {
+    if (non_contig[i].count * non_contig[i].stride == non_contig[i - 1].stride)
+      removed_dim++;
+  }
+  return removed_dim;
+}
+
 /// Internal function to pass data to/from the target.
 // async_info_ptr is currently unused, added here so target_data_update has the
 // same signature as targetDataBegin and targetDataEnd.
@@ -668,88 +795,24 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
       continue;
     }
 
-    void *HstPtrBegin = args[i];
-    int64_t MapSize = arg_sizes[i];
-    bool IsLast, IsHostPtr;
-    void *TgtPtrBegin = Device.getTgtPtrBegin(
-        HstPtrBegin, MapSize, IsLast, false, IsHostPtr, /*MustContain=*/true);
-    if (!TgtPtrBegin) {
-      DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
-      if (arg_types[i] & OMP_TGT_MAPTYPE_PRESENT) {
-        MESSAGE("device mapping required by 'present' motion modifier does not "
-                "exist for host address " DPxMOD " (%" PRId64 " bytes)",
-                DPxPTR(HstPtrBegin), MapSize);
-        return OFFLOAD_FAIL;
-      }
-      continue;
-    }
-
-    if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-        TgtPtrBegin == HstPtrBegin) {
-      DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
-         DPxPTR(HstPtrBegin));
-      continue;
-    }
-
-    if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
-      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-          arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-      int rt = Device.retrieveData(HstPtrBegin, TgtPtrBegin, MapSize, nullptr);
-      if (rt != OFFLOAD_SUCCESS) {
-        REPORT("Copying data from device failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      uintptr_t lb = (uintptr_t) HstPtrBegin;
-      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
-      Device.ShadowMtx.lock();
-      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
-          it != Device.ShadowPtrMap.end(); ++it) {
-        void **ShadowHstPtrAddr = (void**) it->first;
-        if ((uintptr_t) ShadowHstPtrAddr < lb)
-          continue;
-        if ((uintptr_t) ShadowHstPtrAddr >= ub)
-          break;
-        DP("Restoring original host pointer value " DPxMOD " for host pointer "
-            DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
-            DPxPTR(ShadowHstPtrAddr));
-        *ShadowHstPtrAddr = it->second.HstPtrVal;
-      }
-      Device.ShadowMtx.unlock();
-    }
-
-    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
-      DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
-          arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-      int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, MapSize, nullptr);
-      if (rt != OFFLOAD_SUCCESS) {
-        REPORT("Copying data to device failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      uintptr_t lb = (uintptr_t) HstPtrBegin;
-      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
-      Device.ShadowMtx.lock();
-      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
-          it != Device.ShadowPtrMap.end(); ++it) {
-        void **ShadowHstPtrAddr = (void **)it->first;
-        if ((uintptr_t)ShadowHstPtrAddr < lb)
-          continue;
-        if ((uintptr_t)ShadowHstPtrAddr >= ub)
-          break;
-        DP("Restoring original target pointer value " DPxMOD " for target "
-           "pointer " DPxMOD "\n",
-           DPxPTR(it->second.TgtPtrVal), DPxPTR(it->second.TgtPtrAddr));
-        rt = Device.submitData(it->second.TgtPtrAddr, &it->second.TgtPtrVal,
-                               sizeof(void *), nullptr);
-        if (rt != OFFLOAD_SUCCESS) {
-          REPORT("Copying data to device failed.\n");
-          Device.ShadowMtx.unlock();
-          return OFFLOAD_FAIL;
-        }
-      }
-      Device.ShadowMtx.unlock();
+    int rt = OFFLOAD_SUCCESS;
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_NON_CONTIG) {
+      __tgt_target_non_contig *non_contig = (__tgt_target_non_contig *)args[i];
+      int32_t dim_size = arg_sizes[i];
+      uint64_t size =
+          non_contig[dim_size - 1].count * non_contig[dim_size - 1].stride;
+      int32_t merged_dim =
+          get_non_contig_merged_dimension(non_contig, dim_size);
+      rt = target_data_non_contiguous(
+          Device, args_base[i], non_contig, size, arg_types[i],
+          /*current_dim=*/0, dim_size - merged_dim, /*offset=*/0);
+    } else {
+      rt = target_data_contiguous(Device, args_base[i], args[i], arg_sizes[i],
+                                  arg_types[i]);
     }
+    if (rt == OFFLOAD_FAIL)
+      return OFFLOAD_FAIL;
   }
   return OFFLOAD_SUCCESS;
 }

diff  --git a/openmp/libomptarget/test/offloading/non_contiguous_update.cpp b/openmp/libomptarget/test/offloading/non_contiguous_update.cpp
new file mode 100644
index 000000000000..0c1a809887a0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/non_contiguous_update.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=DEBUG
+// REQUIRES: libomptarget-debug
+
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+
+// Data structure definitions copied from OpenMP RTL.
+struct __tgt_target_non_contig {
+  int64_t offset;
+  int64_t width;
+  int64_t stride;
+};
+
+enum tgt_map_type {
+  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000
+};
+
+// OpenMP RTL interfaces
+#ifdef __cplusplus
+extern "C" {
+#endif
+void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+                              void **args_base, void **args, int64_t *arg_sizes,
+                              int64_t *arg_types);
+#ifdef __cplusplus
+}
+#endif
+
+int main() {
+  // case 1
+  // int arr[3][4][5][6];
+  // #pragma omp target update to(arr[0:2][1:3][1:2][:])
+  // set up descriptor
+  __tgt_target_non_contig non_contig[5] = {
+      {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}};
+  int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG;
+
+  void *base;
+  void *begin = &non_contig;
+  int64_t *sizes = &size;
+  int64_t *types = &type;
+
+  // The below diagram is the visualization of the non-contiguous transfer after
+  // optimization. Note that each element represent the innermost dimension
+  // (unit size = 24) since the stride * count of last dimension is equal to the
+  // stride of second last dimension.
+  //
+  // OOOOO OOOOO OOOOO
+  // OXXOO OXXOO OOOOO
+  // OXXOO OXXOO OOOOO
+  // OXXOO OXXOO OOOOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 144
+  // DEBUG: offset 264
+  // DEBUG: offset 384
+  // DEBUG: offset 624
+  // DEBUG: offset 744
+  // DEBUG: offset 864
+
+
+  // case 2
+  // double darr[3][4][5];
+  // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
+  // set up descriptor
+  __tgt_target_non_contig non_contig_2[4] = {
+      {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}};
+  int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG;
+
+  void *base_2;
+  void *begin_2 = &non_contig_2;
+  int64_t *sizes_2 = &size_2;
+  int64_t *types_2 = &type_2;
+
+  // The below diagram is the visualization of the non-contiguous transfer after
+  // optimization. Note that each element represent the innermost dimension
+  // (unit size = 24) since the stride * count of last dimension is equal to the
+  // stride of second last dimension.
+  //
+  // OOOOO OOOOO OOOOO
+  // OOOOO OOOOO OOOOO
+  // XOXOO OOOOO XOXOO
+  // XOXOO OOOOO XOXOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2,
+                           sizes_2, types_2);
+  // DEBUG: offset 80
+  // DEBUG: offset 96
+  // DEBUG: offset 120
+  // DEBUG: offset 136
+  // DEBUG: offset 400
+  // DEBUG: offset 416
+  // DEBUG: offset 440
+  // DEBUG: offset 456
+  return 0;
+}


        


More information about the Openmp-commits mailing list