[Openmp-commits] [openmp] r337455 - [OpenMP][libomptarget] New map interface: remove translation code and ensure proper alignment of struct members

George Rokos via Openmp-commits openmp-commits at lists.llvm.org
Thu Jul 19 06:41:03 PDT 2018


Author: grokos
Date: Thu Jul 19 06:41:03 2018
New Revision: 337455

URL: http://llvm.org/viewvc/llvm-project?rev=337455&view=rev
Log:
[OpenMP][libomptarget] New map interface: remove translation code and ensure proper alignment of struct members

This patch removes the translation code since this functionality is now implemented in the compiler.
target_data_begin and target_data_end are also patched to handle some special cases that used to be
handled by the obsolete translation function, namely ensure proper alignment of struct members when
we have partially mapped structs. Mapping a struct from a higher address (i.e. not from its beginning)
can result in distortion of the alignment for some of its member fields. Padding restores the original
(proper) alignment.

Differential revision: https://reviews.llvm.org/D44186


Modified:
    openmp/trunk/libomptarget/include/omptarget.h
    openmp/trunk/libomptarget/src/interface.cpp
    openmp/trunk/libomptarget/src/omptarget.cpp

Modified: openmp/trunk/libomptarget/include/omptarget.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/include/omptarget.h?rev=337455&r1=337454&r2=337455&view=diff
==============================================================================
--- openmp/trunk/libomptarget/include/omptarget.h (original)
+++ openmp/trunk/libomptarget/include/omptarget.h Thu Jul 19 06:41:03 2018
@@ -48,7 +48,7 @@ enum tgt_map_type {
   OMP_TGT_MAPTYPE_LITERAL         = 0x100,
   // mapping is implicit
   OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
-  // member of struct, member given by 16 MSBs - 1
+  // member of struct, member given by [16 MSBs] - 1
   OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
 };
 

Modified: openmp/trunk/libomptarget/src/interface.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/src/interface.cpp?rev=337455&r1=337454&r2=337455&view=diff
==============================================================================
--- openmp/trunk/libomptarget/src/interface.cpp (original)
+++ openmp/trunk/libomptarget/src/interface.cpp Thu Jul 19 06:41:03 2018
@@ -33,265 +33,36 @@ EXTERN void __tgt_unregister_lib(__tgt_b
   RTLs.UnregisterLib(desc);
 }
 
-// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
-// translate_map, cleanup_map) will be removed once the compiler starts using
-// the new map types.
-
-// Old map types
-enum tgt_oldmap_type {
-  OMP_TGT_OLDMAPTYPE_TO          = 0x001, // copy data from host to device
-  OMP_TGT_OLDMAPTYPE_FROM        = 0x002, // copy data from device to host
-  OMP_TGT_OLDMAPTYPE_ALWAYS      = 0x004, // copy regardless of the ref. count
-  OMP_TGT_OLDMAPTYPE_DELETE      = 0x008, // force unmapping of data
-  OMP_TGT_OLDMAPTYPE_MAP_PTR     = 0x010, // map pointer as well as pointee
-  OMP_TGT_OLDMAPTYPE_FIRST_MAP   = 0x020, // first occurrence of mapped variable
-  OMP_TGT_OLDMAPTYPE_RETURN_PTR  = 0x040, // return TgtBase addr of mapped data
-  OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
-  OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100  // copy by value - not mapped
-};
-
-// Temporary functions for map translation and cleanup
-struct combined_entry_t {
-  int num_members; // number of members in combined entry
-  void *base_addr; // base address of combined entry
-  void *begin_addr; // begin address of combined entry
-  void *end_addr; // size of combined entry
-};
-
-static void translate_map(int32_t arg_num, void **args_base, void **args,
-    int64_t *arg_sizes, int64_t *arg_types, int32_t &new_arg_num,
-    void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
-    int64_t *&new_arg_types, bool is_target_construct) {
-  if (arg_num <= 0) {
-    DP("Nothing to translate\n");
-    new_arg_num = 0;
-    return;
-  }
-
-  // array of combined entries
-  combined_entry_t *cmb_entries =
-      (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
-  // number of combined entries
-  long num_combined = 0;
-  // old entry is MAP_PTR?
-  bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool));
-  // old entry is member of member_of[old] cmb_entry
-  int *member_of = (int *) alloca(arg_num * sizeof(int));
-  // temporary storage for modifications of the original arg_types
-  int64_t *mod_arg_types = (int64_t *) alloca(arg_num  *sizeof(int64_t));
-
-  DP("Translating %d map entries\n", arg_num);
-  for (int i = 0; i < arg_num; ++i) {
-    member_of[i] = -1;
-    is_ptr_old[i] = false;
-    mod_arg_types[i] = arg_types[i];
-    // Scan previous entries to see whether this entry shares the same base
-    for (int j = 0; j < i; ++j) {
-      void *new_begin_addr = NULL;
-      void *new_end_addr = NULL;
-
-      if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
-        if (args_base[i] == args[j]) {
-          if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) {
-            DP("Entry %d has the same base as entry %d's begin address\n", i,
-                j);
-            new_begin_addr = args_base[i];
-            new_end_addr = (char *)args_base[i] + sizeof(void *);
-            assert(arg_sizes[j] == sizeof(void *));
-            is_ptr_old[j] = true;
-          } else {
-            DP("Entry %d has the same base as entry %d's begin address, but "
-                "%d's base was a MAP_PTR too\n", i, j, j);
-            int32_t to_from_always_delete =
-                OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM |
-                OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE;
-            if (mod_arg_types[j] & to_from_always_delete) {
-              DP("Resetting to/from/always/delete flags for entry %d because "
-                  "it is only a pointer to pointer\n", j);
-              mod_arg_types[j] &= ~to_from_always_delete;
-            }
-          }
-        }
-      } else {
-        if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) &&
-            args_base[i] == args_base[j]) {
-          DP("Entry %d has the same base address as entry %d\n", i, j);
-          new_begin_addr = args[i];
-          new_end_addr = (char *)args[i] + arg_sizes[i];
-        }
-      }
-
-      // If we have combined the entry with a previous one
-      if (new_begin_addr) {
-        int id;
-        if(member_of[j] == -1) {
-          // We have a new entry
-          id = num_combined++;
-          DP("Creating new combined entry %d for old entry %d\n", id, j);
-          // Initialize new entry
-          cmb_entries[id].num_members = 1;
-          cmb_entries[id].base_addr = args_base[j];
-          if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
-            cmb_entries[id].begin_addr = args_base[j];
-            cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j];
-          } else {
-            cmb_entries[id].begin_addr = args[j];
-            cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j];
-          }
-          member_of[j] = id;
-        } else {
-          // Reuse existing combined entry
-          DP("Reusing existing combined entry %d\n", member_of[j]);
-          id = member_of[j];
-        }
-
-        // Update combined entry
-        DP("Adding entry %d to combined entry %d\n", i, id);
-        cmb_entries[id].num_members++;
-        // base_addr stays the same
-        cmb_entries[id].begin_addr =
-            std::min(cmb_entries[id].begin_addr, new_begin_addr);
-        cmb_entries[id].end_addr =
-            std::max(cmb_entries[id].end_addr, new_end_addr);
-        member_of[i] = id;
-        break;
-      }
-    }
-  }
-
-  DP("New entries: %ld combined + %d original\n", num_combined, arg_num);
-  new_arg_num = arg_num + num_combined;
-  new_args_base = (void **) malloc(new_arg_num * sizeof(void *));
-  new_args = (void **) malloc(new_arg_num * sizeof(void *));
-  new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
-  new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
-
-  const int64_t alignment = 8;
-
-  int next_id = 0; // next ID
-  int next_cid = 0; // next combined ID
-  int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int));
-  for (int i = 0; i < arg_num; ++i) {
-    // It is member_of
-    if (member_of[i] == next_cid) {
-      int cid = next_cid++; // ID of this combined entry
-      int nid = next_id++; // ID of the new (global) entry
-      combined_to_new_id[cid] = nid;
-      DP("Combined entry %3d will become new entry %3d\n", cid, nid);
-
-      int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment;
-      if (padding) {
-        DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n",
-            padding, DPxPTR(cmb_entries[cid].begin_addr));
-        cmb_entries[cid].begin_addr =
-            (char *)cmb_entries[cid].begin_addr - padding;
-      }
-
-      new_args_base[nid] = cmb_entries[cid].base_addr;
-      new_args[nid] = cmb_entries[cid].begin_addr;
-      new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr -
-          (char *)cmb_entries[cid].begin_addr);
-      new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", "
-          "size %" PRId64 ", type 0x%" PRIx64 "\n", nid,
-          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-          new_arg_types[nid]);
-    } else if (member_of[i] != -1) {
-      DP("Combined entry %3d has been encountered before, do nothing\n",
-          member_of[i]);
-    }
-
-    // Now that the combined entry (the one the old entry was a member of) has
-    // been inserted into the new arguments list, proceed with the old entry.
-    int nid = next_id++;
-    DP("Old entry %3d will become new entry %3d\n", i, nid);
-
-    new_args_base[nid] = args_base[i];
-    new_args[nid] = args[i];
-    new_arg_sizes[nid] = arg_sizes[i];
-    int64_t old_type = mod_arg_types[i];
-
-    if (is_ptr_old[i]) {
-      // Reset TO and FROM flags
-      old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM);
-    }
-
-    if (member_of[i] == -1) {
-      if (!is_target_construct)
-        old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM;
-      new_arg_types[nid] = old_type;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
-          ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid,
-          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-          new_arg_types[nid], i);
-    } else {
-      // Old entry is not FIRST_MAP
-      old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP;
-      // Add MEMBER_OF
-      int new_member_of = combined_to_new_id[member_of[i]];
-      old_type |= ((int64_t)new_member_of + 1) << 48;
-      new_arg_types[nid] = old_type;
-      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
-        ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid,
-        DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
-        new_arg_types[nid], i, new_member_of);
-    }
-  }
-}
-
-static void cleanup_map(int32_t new_arg_num, void **new_args_base,
-    void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types,
-    int32_t arg_num, void **args_base) {
-  if (new_arg_num > 0) {
-    int offset = new_arg_num - arg_num;
-    for (int32_t i = 0; i < arg_num; ++i) {
-      // Restore old base address
-      args_base[i] = new_args_base[i+offset];
-    }
-    free(new_args_base);
-    free(new_args);
-    free(new_arg_sizes);
-    free(new_arg_types);
-  }
-}
-
 /// creates host-to-target data mapping, stores it in the
 /// libomptarget.so internal structure (an entry in a stack of data maps)
 /// and passes the data to the device.
 EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  DP("Entering data begin region for device %ld with %d mappings\n", device_id,
-     arg_num);
+  DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+      device_id, arg_num);
 
   // No devices available?
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
-    DP("Use default device id %ld\n", device_id);
+    DP("Use default device id %" PRId64 "\n", device_id);
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
   DeviceTy& Device = Devices[device_id];
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, false);
-
-  //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
-  target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
 }
 
 EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
@@ -321,32 +92,25 @@ EXTERN void __tgt_target_data_end(int64_
   size_t Devices_size = Devices.size();
   RTLsMtx.unlock();
   if (Devices_size <= (size_t)device_id) {
-    DP("Device ID  %ld does not have a matching RTL.\n", device_id);
+    DP("Device ID  %" PRId64 " does not have a matching RTL.\n", device_id);
     return;
   }
 
   DeviceTy &Device = Devices[device_id];
   if (!Device.IsInit) {
-    DP("uninit device: ignore");
+    DP("Uninit device: ignore");
     return;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, false);
-
-  //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
-  target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
 }
 
 EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
@@ -370,7 +134,7 @@ EXTERN void __tgt_target_data_update(int
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
@@ -391,35 +155,28 @@ EXTERN void __tgt_target_data_update_now
 
 EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
-  DP("Entering target region with entry point " DPxMOD " and device Id %ld\n",
-     DPxPTR(host_ptr), device_id);
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
 
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return OFFLOAD_FAIL;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, true);
-
-  //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-  //    arg_types, 0, 0, false /*team*/, false /*recursive*/);
-  int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
-      new_arg_sizes, new_arg_types, 0, 0, false /*team*/);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, 0, 0, false /*team*/);
 
   return rc;
 }
@@ -438,36 +195,28 @@ EXTERN int __tgt_target_nowait(int64_t d
 EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr,
     int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
     int64_t *arg_types, int32_t team_num, int32_t thread_limit) {
-  DP("Entering target region with entry point " DPxMOD " and device Id %ld\n",
-     DPxPTR(host_ptr), device_id);
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
 
   if (device_id == OFFLOAD_DEVICE_DEFAULT) {
     device_id = omp_get_default_device();
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return OFFLOAD_FAIL;
   }
 
-  // Translate maps
-  int32_t new_arg_num;
-  void **new_args_base;
-  void **new_args;
-  int64_t *new_arg_sizes;
-  int64_t *new_arg_types;
-  translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
-      new_args_base, new_args, new_arg_sizes, new_arg_types, true);
-
-  //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
-  //              arg_types, team_num, thread_limit, true /*team*/,
-  //              false /*recursive*/);
-  int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
-      new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/);
-
-  // Cleanup translation memory
-  cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
-      new_arg_types, arg_num, args_base);
+#ifdef OMPTARGET_DEBUG
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, team_num, thread_limit, true /*team*/);
 
   return rc;
 }
@@ -492,11 +241,11 @@ EXTERN void __kmpc_push_target_tripcount
   }
 
   if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
-    DP("Failed to get device %ld ready\n", device_id);
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
     return;
   }
 
-  DP("__kmpc_push_target_tripcount(%ld, %" PRIu64 ")\n", device_id,
+  DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id,
       loop_tripcount);
   Devices[device_id].loopTripCnt = loop_tripcount;
 }

Modified: openmp/trunk/libomptarget/src/omptarget.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/src/omptarget.cpp?rev=337455&r1=337454&r2=337455&view=diff
==============================================================================
--- openmp/trunk/libomptarget/src/omptarget.cpp (original)
+++ openmp/trunk/libomptarget/src/omptarget.cpp Thu Jul 19 06:41:03 2018
@@ -25,6 +25,38 @@
 int DebugLevel = 0;
 #endif // OMPTARGET_DEBUG
 
+/* All begin addresses for partially mapped structs must be 8-aligned in order
+ * to ensure proper alignment of members. E.g.
+ *
+ * struct S {
+ *   int a;   // 4-aligned
+ *   int b;   // 4-aligned
+ *   int *p;  // 8-aligned
+ * } s1;
+ * ...
+ * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
+ * {
+ *   s1.b = 5;
+ *   for (int i...) s1.p[i] = ...;
+ * }
+ *
+ * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
+ * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
+ * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
+ * requirements for its type. Now, when we allocate memory on the device, in
+ * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
+ * This means that the chunk of the struct on the device will start at a
+ * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
+ * address of p will be a misaligned 0x204 (on the host there was no need to add
+ * padding between b and p, so p comes exactly 4 bytes after b). If the device
+ * kernel tries to access s1.p, a misaligned address error occurs (as reported
+ * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
+ * extending the size of the allocated chuck accordingly, the chuck on the
+ * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
+ * &s1.p=0x208, as they should be to satisfy the alignment requirements.
+ */
+static const int64_t alignment = 8;
+
 /// Map global data and execute pending ctors
 static int InitLibrary(DeviceTy& Device) {
   /*
@@ -172,7 +204,7 @@ int CheckDeviceAndCtors(int64_t device_i
   return OFFLOAD_SUCCESS;
 }
 
-static short member_of(int64_t type) {
+static int32_t member_of(int64_t type) {
   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
 }
 
@@ -189,10 +221,33 @@ int target_data_begin(DeviceTy &Device,
 
     void *HstPtrBegin = args[i];
     void *HstPtrBase = args_base[i];
+    int64_t data_size = arg_sizes[i];
+
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
     // Address of pointer on the host and device, respectively.
     void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
     bool IsNew, Pointer_IsNew;
     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
+    // have reached this point via __tgt_target_data_begin and not __tgt_target
+    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
+    // associated with a target region, so there are no target parameters). This
+    // may be considered a hack, we could revise the scheme in the future.
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
       DP("Has a pointer entry: \n");
@@ -213,28 +268,22 @@ int target_data_begin(DeviceTy &Device,
     }
 
     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
-        arg_sizes[i], IsNew, IsImplicit, UpdateRef);
-    if (!TgtPtrBegin && arg_sizes[i]) {
-      // If arg_sizes[i]==0, then the argument is a pointer to NULL, so
-      // getOrAlloc() returning NULL is not an error.
+        data_size, IsNew, IsImplicit, UpdateRef);
+    if (!TgtPtrBegin && data_size) {
+      // If data_size==0, then the argument could be a zero-length pointer to
+      // NULL, so getOrAlloc() returning NULL is not an error.
       DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
           "illegal mapping).\n");
     }
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-        " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
+        " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
         (IsNew ? "" : " not"));
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
-      void *ret_ptr;
-      if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
-        ret_ptr = Pointer_TgtPtrBegin;
-      else {
-        bool IsLast; // not used
-        ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false);
-      }
-
-      DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr));
-      args_base[i] = ret_ptr;
+      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
+      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
+      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
+      args_base[i] = TgtPtrBase;
     }
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
@@ -243,7 +292,7 @@ int target_data_begin(DeviceTy &Device,
         copy = true;
       } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
         // Copy data only if the "parent" struct has RefCount==1.
-        short parent_idx = member_of(arg_types[i]);
+        int32_t parent_idx = member_of(arg_types[i]);
         long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
         assert(parent_rc > 0 && "parent struct not found");
         if (parent_rc == 1) {
@@ -253,8 +302,8 @@ int target_data_begin(DeviceTy &Device,
 
       if (copy) {
         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
-            arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           rc = OFFLOAD_FAIL;
@@ -297,16 +346,33 @@ int target_data_end(DeviceTy &Device, in
       continue;
 
     void *HstPtrBegin = args[i];
+    int64_t data_size = arg_sizes[i];
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
     bool IsLast;
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
         (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
     bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
-    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
+    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
         UpdateRef);
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-        " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
+        " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
         (IsLast ? "" : " not"));
 
     bool DelEntry = IsLast || ForceDelete;
@@ -324,7 +390,7 @@ int target_data_end(DeviceTy &Device, in
         if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
             !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
           // Copy data only if the "parent" struct has RefCount==1.
-          short parent_idx = member_of(arg_types[i]);
+          int32_t parent_idx = member_of(arg_types[i]);
           long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
           assert(parent_rc > 0 && "parent struct not found");
           if (parent_rc == 1) {
@@ -334,8 +400,8 @@ int target_data_end(DeviceTy &Device, in
 
         if (DelEntry || Always || CopyMember) {
           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-              arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]);
+              data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
           if (rt != OFFLOAD_SUCCESS) {
             DP("Copying data from device failed.\n");
             rc = OFFLOAD_FAIL;
@@ -348,7 +414,7 @@ int target_data_end(DeviceTy &Device, in
       // copies. If the struct is going to be deallocated, remove any remaining
       // shadow pointer entries for this struct.
       uintptr_t lb = (uintptr_t) HstPtrBegin;
-      uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i];
+      uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
       Device.ShadowMtx.lock();
       for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
           it != Device.ShadowPtrMap.end(); ++it) {
@@ -378,7 +444,7 @@ int target_data_end(DeviceTy &Device, in
 
       // Deallocate map
       if (DelEntry) {
-        int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete);
+        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Deallocating data from device failed.\n");
           rc = OFFLOAD_FAIL;




More information about the Openmp-commits mailing list