[Openmp-commits] [openmp] d9b1d82 - [NFC][OpenMP] Prepare amdgpu plugin for asynchronous implementation of target region launch

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Tue Dec 7 13:03:10 PST 2021


Author: Carlo Bertolli
Date: 2021-12-07T21:02:45Z
New Revision: d9b1d827d2e9ae135901b6eccf25a05ef49f38af

URL: https://github.com/llvm/llvm-project/commit/d9b1d827d2e9ae135901b6eccf25a05ef49f38af
DIFF: https://github.com/llvm/llvm-project/commit/d9b1d827d2e9ae135901b6eccf25a05ef49f38af.diff

LOG: [NFC][OpenMP] Prepare amdgpu plugin for asynchronous implementation of target region launch

At present, the amdgpu plugin merges the asynchronous and synchronous kernel launch implementations into a single synchronous version.
This patch prepares the plugin for an asynchronous implementation by:
- Moving the actual kernel launch code (valid in both cases) into a base function in an anonymous namespace

The actual separation of the kernel launch code (async vs sync) will follow in a subsequent patch.
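
For context, a rough sketch of the intended layering once the follow-up patch
lands (the async entry point is hypothetical and not part of this commit):

    // Base function (anonymous namespace): builds the dispatch packet,
    // publishes it and, for now, waits on the completion signal.
    int32_t __tgt_rtl_run_target_team_region_locked(/* device, kernel, args */);

    int32_t __tgt_rtl_run_target_team_region(/* same arguments */) {
      DeviceInfo.load_run_lock.lock_shared();
      int32_t Res = __tgt_rtl_run_target_team_region_locked(/* ... */);
      DeviceInfo.load_run_lock.unlock_shared();
      return Res; // synchronous today: completion has already been awaited
    }

    // Planned follow-up (hypothetical): an async variant that returns before
    // the completion signal fires and resolves it through __tgt_async_info.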

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115267

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 45d94765936ab..5434692d0119d 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -883,113 +883,473 @@ void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
   AsyncInfo->Queue = 0;
 }
 
-bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
-  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
-  int32_t r = elf_check_machine(image, amdgcnMachineID);
-  if (!r) {
-    DP("Supported machine ID not found\n");
-  }
-  return r;
-}
+// Determine launch values for kernel.
+struct launchVals {
+  int WorkgroupSize;
+  int GridSize;
+};
+launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
+                         int ConstWGSize,
+                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
+                         int num_teams, int thread_limit,
+                         uint64_t loop_tripcount, int DeviceNumTeams) {
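+  // Team-count precedence below: a num_teams clause wins (capped by
+  // Env.MaxTeamsDefault), then Env.NumTeams, then Env.TeamLimit, then a
+  // default derived from the loop trip count.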
 
-uint32_t elf_e_flags(__tgt_device_image *image) {
-  char *img_begin = (char *)image->ImageStart;
-  size_t img_size = (char *)image->ImageEnd - img_begin;
+  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
+  int num_groups = 0;
 
-  Elf *e = elf_memory(img_begin, img_size);
-  if (!e) {
-    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
-    return 0;
+  int Max_Teams =
+      Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
+  if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
+    Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
+
+  if (print_kernel_trace & STARTUP_DETAILS) {
+    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
+    DP("Max_Teams: %d\n", Max_Teams);
+    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
+    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
+    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
+       RTLDeviceInfoTy::Default_WG_Size);
+    DP("thread_limit: %d\n", thread_limit);
+    DP("threadsPerGroup: %d\n", threadsPerGroup);
+    DP("ConstWGSize: %d\n", ConstWGSize);
+  }
+  // check for thread_limit() clause
+  if (thread_limit > 0) {
+    threadsPerGroup = thread_limit;
+    DP("Setting threads per block to requested %d\n", thread_limit);
+    // Add master warp for GENERIC
+    if (ExecutionMode ==
+        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
+      threadsPerGroup += WarpSize;
+      DP("Adding master wavefront: +%d threads\n", WarpSize);
+    }
+    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
+      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
+      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
+    }
+  }
+  // check flat_max_work_group_size attr here
+  if (threadsPerGroup > ConstWGSize) {
+    threadsPerGroup = ConstWGSize;
+    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
+       threadsPerGroup);
   }
+  if (print_kernel_trace & STARTUP_DETAILS)
+    DP("threadsPerGroup: %d\n", threadsPerGroup);
+  DP("Preparing %d threads\n", threadsPerGroup);
 
-  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+  // Set default num_groups (teams)
+  if (Env.TeamLimit > 0)
+    num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
+  else
+    num_groups = Max_Teams;
+  DP("Set default num of groups %d\n", num_groups);
 
-  if (!eh64) {
-    DP("Unable to get machine ID from ELF file!\n");
-    elf_end(e);
-    return 0;
+  if (print_kernel_trace & STARTUP_DETAILS) {
+    DP("num_groups: %d\n", num_groups);
+    DP("num_teams: %d\n", num_teams);
   }
 
-  uint32_t Flags = eh64->e_flags;
-
-  elf_end(e);
-  DP("ELF Flags: 0x%x\n", Flags);
-  return Flags;
-}
-} // namespace
+  // Reduce num_groups if threadsPerGroup exceeds
+  // RTLDeviceInfoTy::Default_WG_Size. This reduction is typical for the
+  // default case (no thread_limit clause) or when the user requests an
+  // excessive num_teams clause.
+  // FIXME: We can't distinguish between a constant and a variable thread
+  // limit, so we only handle constant thread_limits.
+  if (threadsPerGroup >
+      RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
+    // Should we round threadsPerGroup up to nearest WarpSize
+    // here?
+    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
 
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
-  return elf_machine_id_is_amdgcn(image);
-}
+  // check for num_teams() clause
+  if (num_teams > 0) {
+    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
+  }
+  if (print_kernel_trace & STARTUP_DETAILS) {
+    DP("num_groups: %d\n", num_groups);
+    DP("Env.NumTeams %d\n", Env.NumTeams);
+    DP("Env.TeamLimit %d\n", Env.TeamLimit);
+  }
 
-int __tgt_rtl_number_of_devices() {
-  // If the construction failed, no methods are safe to call
-  if (DeviceInfo.ConstructionSucceeded) {
-    return DeviceInfo.NumberOfDevices;
+  if (Env.NumTeams > 0) {
+    num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
+    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
+  } else if (Env.TeamLimit > 0) {
+    num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
+    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
   } else {
-    DP("AMDGPU plugin construction failed. Zero devices available\n");
-    return 0;
+    if (num_teams <= 0) {
+      if (loop_tripcount > 0) {
+        if (ExecutionMode ==
+            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
+          // round up to the nearest integer
+          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
+        } else if (ExecutionMode ==
+                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
+          num_groups = loop_tripcount;
+        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
+          // This is a generic kernel that was transformed to use SPMD-mode
+          // execution but uses Generic-mode semantics for scheduling.
+          num_groups = loop_tripcount;
+        }
+        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+           "threads per block %d\n",
+           num_groups, loop_tripcount, threadsPerGroup);
+      }
+    } else {
+      num_groups = num_teams;
+    }
+    if (num_groups > Max_Teams) {
+      num_groups = Max_Teams;
+      if (print_kernel_trace & STARTUP_DETAILS)
+        DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
+    }
+    if (num_groups > num_teams && num_teams > 0) {
+      num_groups = num_teams;
+      if (print_kernel_trace & STARTUP_DETAILS)
+        DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
+           num_teams);
+    }
   }
-}
 
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo.RequiresFlags = RequiresFlags;
-  return RequiresFlags;
+  // The num_teams clause is always honored, except that Env.MaxTeamsDefault,
+  // when set, caps it.
+  if (num_teams > 0) {
+    num_groups = num_teams;
+    // Cap num_groups to Env.MaxTeamsDefault if set.
+    if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
+      num_groups = Env.MaxTeamsDefault;
+  }
+  if (print_kernel_trace & STARTUP_DETAILS) {
+    DP("threadsPerGroup: %d\n", threadsPerGroup);
+    DP("num_groups: %d\n", num_groups);
+    DP("loop_tripcount: %" PRIu64 "\n", loop_tripcount);
+  }
+  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
+     threadsPerGroup);
+
+  launchVals res;
+  res.WorkgroupSize = threadsPerGroup;
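+  // HSA counts grid_size in work-items rather than workgroups, so the grid
+  // is the group count scaled by the group size.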
+  res.GridSize = threadsPerGroup * num_groups;
+  return res;
 }
 
-namespace {
-template <typename T> bool enforce_upper_bound(T *value, T upper) {
-  bool changed = *value > upper;
-  if (changed) {
-    *value = upper;
+static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
+  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
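+  // Spin until the ring has room: the queue is full while this packet's slot
+  // is a whole queue->size ahead of the packet processor's read index.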
+  bool full = true;
+  while (full) {
+    full =
+        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
   }
-  return changed;
+  return packet_id;
 }
-} // namespace
 
-int32_t __tgt_rtl_init_device(int device_id) {
-  hsa_status_t err;
+int32_t __tgt_rtl_run_target_team_region_locked(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
+    int32_t thread_limit, uint64_t loop_tripcount) {
+  // Set the context we are using
+  // update thread limit content in gpu memory if un-initialized or specified
+  // from host
 
-  // this is per device id init
-  DP("Initialize the device id: %d\n", device_id);
+  DP("Run target team region thread_limit %d\n", thread_limit);
 
-  hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
+  // All args are references.
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
 
-  // Get number of Compute Unit
-  uint32_t compute_units = 0;
-  err = hsa_agent_get_info(
-      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
-      &compute_units);
-  if (err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.ComputeUnits[device_id] = 1;
-    DP("Error getting compute units : settiing to 1\n");
-  } else {
-    DeviceInfo.ComputeUnits[device_id] = compute_units;
-    DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]);
+  DP("Arg_num: %d\n", arg_num);
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+    DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
   }
 
-  char GetInfoName[64]; // 64 max size returned by get info
-  err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
-                           (void *)GetInfoName);
-  if (err)
-    DeviceInfo.GPUName[device_id] = "--unknown gpu--";
-  else {
-    DeviceInfo.GPUName[device_id] = GetInfoName;
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  std::string kernel_name = std::string(KernelInfo->Name);
+  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
+  if (KernelInfoTable[device_id].find(kernel_name) ==
+      KernelInfoTable[device_id].end()) {
+    DP("Kernel %s not found\n", kernel_name.c_str());
+    return OFFLOAD_FAIL;
   }
 
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("Device#%-2d CU's: %2d %s\n", device_id,
-       DeviceInfo.ComputeUnits[device_id],
-       DeviceInfo.GPUName[device_id].c_str());
+  const atl_kernel_info_t KernelInfoEntry =
+      KernelInfoTable[device_id][kernel_name];
+  const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
+  const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
+  const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
+  const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
+  const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
 
-  // Query attributes to determine number of threads/block and blocks/grid.
-  uint16_t workgroup_max_dim[3];
-  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
-                           &workgroup_max_dim);
-  if (err != HSA_STATUS_SUCCESS) {
-    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
-    DP("Error getting grid dims: num groups : %d\n",
+  assert(arg_num == (int)KernelInfoEntry.explicit_argument_count);
+
+  /*
+   * Set limit based on ThreadsPerGroup and GroupsPerDevice
+   */
+  launchVals LV =
+      getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
+                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
+                    num_teams,      // From run_region arg
+                    thread_limit,   // From run_region arg
+                    loop_tripcount, // From run_region arg
+                    DeviceInfo.NumTeams[KernelInfo->device_id]);
+  const int GridSize = LV.GridSize;
+  const int WorkgroupSize = LV.WorkgroupSize;
+
+  if (print_kernel_trace >= LAUNCH) {
+    int num_groups = GridSize / WorkgroupSize;
+    // enum modes are SPMD, GENERIC, NONE 0,1,2
+    // if doing rtl timing, print to stderr, unless stdout requested.
+    bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
+    fprintf(traceToStdout ? stdout : stderr,
+            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
+            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
+            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
+            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
+            arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
+            group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
+            vgpr_spill_count, loop_tripcount, KernelInfo->Name);
+  }
+
+  // Run on the device.
+  {
+    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    if (!queue) {
+      return OFFLOAD_FAIL;
+    }
+    uint64_t packet_id = acquire_available_packet_id(queue);
+
+    const uint32_t mask = queue->size - 1; // size is a power of 2
+    hsa_kernel_dispatch_packet_t *packet =
+        (hsa_kernel_dispatch_packet_t *)queue->base_address +
+        (packet_id & mask);
+
+    // packet->header is written last
+    packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    packet->workgroup_size_x = WorkgroupSize;
+    packet->workgroup_size_y = 1;
+    packet->workgroup_size_z = 1;
+    packet->reserved0 = 0;
+    packet->grid_size_x = GridSize;
+    packet->grid_size_y = 1;
+    packet->grid_size_z = 1;
+    packet->private_segment_size = KernelInfoEntry.private_segment_size;
+    packet->group_segment_size = KernelInfoEntry.group_segment_size;
+    packet->kernel_object = KernelInfoEntry.kernel_object;
+    packet->kernarg_address = 0;     // use the block allocator
+    packet->reserved2 = 0;           // impl writes id_ here
+    packet->completion_signal = {0}; // may want a pool of signals
+
+    KernelArgPool *ArgPool = nullptr;
+    void *kernarg = nullptr;
+    {
+      auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
+      if (it != KernelArgPoolMap.end()) {
+        ArgPool = (it->second).get();
+      }
+    }
+    if (!ArgPool) {
+      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
+         device_id);
+    }
+    {
+      if (ArgPool) {
+        assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
+        kernarg = ArgPool->allocate(arg_num);
+      }
+      if (!kernarg) {
+        DP("Allocate kernarg failed\n");
+        return OFFLOAD_FAIL;
+      }
+
+      // Copy explicit arguments
+      for (int i = 0; i < arg_num; i++) {
+        memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
+      }
+
+      // Initialize implicit arguments. TODO: Which of these can be dropped
+      impl_implicit_args_t *impl_args =
+          reinterpret_cast<impl_implicit_args_t *>(
+              static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
+      memset(impl_args, 0,
+             sizeof(impl_implicit_args_t)); // may not be necessary
+      impl_args->offset_x = 0;
+      impl_args->offset_y = 0;
+      impl_args->offset_z = 0;
+
+      // assign a hostcall buffer for the selected Q
+      if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
+        // hostrpc_assign_buffer is not thread safe, and this function is
+        // under a multiple reader lock, not a writer lock.
+        static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
+        pthread_mutex_lock(&hostcall_init_lock);
+        unsigned long buffer = hostrpc_assign_buffer(
+            DeviceInfo.HSAAgents[device_id], queue, device_id);
+        pthread_mutex_unlock(&hostcall_init_lock);
+        if (!buffer) {
+          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
+             "error\n");
+          return OFFLOAD_FAIL;
+        }
+
+        if (KernelInfoEntry.implicit_argument_count >= 4) {
+          // Initialise pointer for implicit_argument_count != 0 ABI
+          // Guess that the right implicit argument is at offset 24 after
+          // the explicit arguments. In the future, should be able to read
+          // the offset from msgpack. Clang is not annotating it at present.
+          uint64_t Offset =
+              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
+          if ((Offset + 8) > (ArgPool->kernarg_segment_size)) {
+            DP("Bad offset of hostcall, exceeds kernarg segment size\n");
+          } else {
+            memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8);
+          }
+        }
+
+        // initialise pointer for implicit_argument_count == 0 ABI
+        impl_args->hostcall_ptr = buffer;
+      }
+
+      packet->kernarg_address = kernarg;
+    }
+
+    hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
+    if (s.handle == 0) {
+      DP("Failed to get signal instance\n");
+      return OFFLOAD_FAIL;
+    }
+    packet->completion_signal = s;
+    hsa_signal_store_relaxed(packet->completion_signal, 1);
+
+    // Publish the packet indicating it is ready to be processed
+    core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
+                               core::create_header(), packet->setup);
+
+    // Since the packet is already published, its contents must not be
+    // accessed any more
+    hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+
+    while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+                                     HSA_WAIT_STATE_BLOCKED) != 0)
+      ;
+
+    assert(ArgPool);
+    ArgPool->deallocate(kernarg);
+    DeviceInfo.FreeSignalPool.push(s);
+  }
+
+  DP("Kernel completed\n");
+  return OFFLOAD_SUCCESS;
+}
+
+bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
+  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
+  int32_t r = elf_check_machine(image, amdgcnMachineID);
+  if (!r) {
+    DP("Supported machine ID not found\n");
+  }
+  return r;
+}
+
+uint32_t elf_e_flags(__tgt_device_image *image) {
+  char *img_begin = (char *)image->ImageStart;
+  size_t img_size = (char *)image->ImageEnd - img_begin;
+
+  Elf *e = elf_memory(img_begin, img_size);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+
+  if (!eh64) {
+    DP("Unable to get machine ID from ELF file!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  uint32_t Flags = eh64->e_flags;
+
+  elf_end(e);
+  DP("ELF Flags: 0x%x\n", Flags);
+  return Flags;
+}
+} // namespace
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+  return elf_machine_id_is_amdgcn(image);
+}
+
+int __tgt_rtl_number_of_devices() {
+  // If the construction failed, no methods are safe to call
+  if (DeviceInfo.ConstructionSucceeded) {
+    return DeviceInfo.NumberOfDevices;
+  } else {
+    DP("AMDGPU plugin construction failed. Zero devices available\n");
+    return 0;
+  }
+}
+
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  DP("Init requires flags to %ld\n", RequiresFlags);
+  DeviceInfo.RequiresFlags = RequiresFlags;
+  return RequiresFlags;
+}
+
+namespace {
+template <typename T> bool enforce_upper_bound(T *value, T upper) {
+  bool changed = *value > upper;
+  if (changed) {
+    *value = upper;
+  }
+  return changed;
+}
+} // namespace
+
+int32_t __tgt_rtl_init_device(int device_id) {
+  hsa_status_t err;
+
+  // this is per device id init
+  DP("Initialize the device id: %d\n", device_id);
+
+  hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
+
+  // Get number of Compute Unit
+  uint32_t compute_units = 0;
+  err = hsa_agent_get_info(
+      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
+      &compute_units);
+  if (err != HSA_STATUS_SUCCESS) {
+    DeviceInfo.ComputeUnits[device_id] = 1;
+    DP("Error getting compute units: setting to 1\n");
+  } else {
+    DeviceInfo.ComputeUnits[device_id] = compute_units;
+    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
+  }
+
+  char GetInfoName[64]; // 64 max size returned by get info
+  err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
+                           (void *)GetInfoName);
+  if (err)
+    DeviceInfo.GPUName[device_id] = "--unknown gpu--";
+  else {
+    DeviceInfo.GPUName[device_id] = GetInfoName;
+  }
+
+  if (print_kernel_trace & STARTUP_DETAILS)
+    DP("Device#%-2d CU's: %2d %s\n", device_id,
+       DeviceInfo.ComputeUnits[device_id],
+       DeviceInfo.GPUName[device_id].c_str());
+
+  // Query attributes to determine number of threads/block and blocks/grid.
+  uint16_t workgroup_max_dim[3];
+  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+                           &workgroup_max_dim);
+  if (err != HSA_STATUS_SUCCESS) {
+    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Error getting grid dims: num groups : %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
   } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
     DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
@@ -1469,769 +1829,404 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
       const char *DeviceName = DeviceInfo.GPUName[device_id].c_str();
       const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image));
 
-      if (strcmp(DeviceName, ElfName) != 0) {
-        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
-           " compiler flag: -march=<gpu>\n",
-           DeviceName, ElfName);
-      } else {
-        DP("Error loading image onto GPU: %s\n", get_error_string(err));
-      }
-
-      return NULL;
-    }
-
-    err = env.after_loading();
-    if (err != HSA_STATUS_SUCCESS) {
-      return NULL;
-    }
-  }
-
-  DP("AMDGPU module successfully loaded!\n");
-
-  {
-    // the device_State array is either large value in bss or a void* that
-    // needs to be assigned to a pointer to an array of size device_state_bytes
-    // If absent, it has been deadstripped and needs no setup.
-
-    void *state_ptr;
-    uint32_t state_ptr_size;
-    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
-    hsa_status_t err = interop_hsa_get_symbol_info(
-        SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
-        &state_ptr_size);
-
-    if (err != HSA_STATUS_SUCCESS) {
-      DP("No device_state symbol found, skipping initialization\n");
-    } else {
-      if (state_ptr_size < sizeof(void *)) {
-        DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
-           sizeof(void *));
-        return NULL;
-      }
-
-      // if it's larger than a void*, assume it's a bss array and no further
-      // initialization is required. Only try to set up a pointer for
-      // sizeof(void*)
-      if (state_ptr_size == sizeof(void *)) {
-        uint64_t device_State_bytes =
-            get_device_State_bytes((char *)image->ImageStart, img_size);
-        if (device_State_bytes == 0) {
-          DP("Can't initialize device_State, missing size information\n");
-          return NULL;
-        }
-
-        auto &dss = DeviceInfo.deviceStateStore[device_id];
-        if (dss.first.get() == nullptr) {
-          assert(dss.second == 0);
-          void *ptr = NULL;
-          hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
-          if (err != HSA_STATUS_SUCCESS) {
-            DP("Failed to allocate device_state array\n");
-            return NULL;
-          }
-          dss = {
-              std::unique_ptr<void, RTLDeviceInfoTy::implFreePtrDeletor>{ptr},
-              device_State_bytes,
-          };
-        }
-
-        void *ptr = dss.first.get();
-        if (device_State_bytes != dss.second) {
-          DP("Inconsistent sizes of device_State unsupported\n");
-          return NULL;
-        }
-
-        // write ptr to device memory so it can be used by later kernels
-        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
-                                                   sizeof(void *), device_id);
-        if (err != HSA_STATUS_SUCCESS) {
-          DP("memcpy install of state_ptr failed\n");
-          return NULL;
-        }
-      }
-    }
-  }
-
-  // Here, we take advantage of the data that is appended after img_end to get
-  // the symbols' name we need to load. This data consist of the host entries
-  // begin and end as well as the target name (see the offloading linker script
-  // creation in clang compiler).
-
-  // Find the symbols in the module by name. The name can be obtain by
-  // concatenating the host entry name with the target name
-
-  __tgt_offload_entry *HostBegin = image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = image->EntriesEnd;
-
-  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
-
-    if (!e->addr) {
-      // The host should have always something in the address to
-      // uniquely identify the target region.
-      DP("Analyzing host entry '<null>' (size = %lld)...\n",
-         (unsigned long long)e->size);
-      return NULL;
-    }
-
-    if (e->size) {
-      __tgt_offload_entry entry = *e;
-
-      void *varptr;
-      uint32_t varsize;
-
-      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
-      hsa_status_t err = interop_hsa_get_symbol_info(
-          SymbolInfoMap, device_id, e->name, &varptr, &varsize);
-
-      if (err != HSA_STATUS_SUCCESS) {
-        // Inform the user what symbol prevented offloading
-        DP("Loading global '%s' (Failed)\n", e->name);
-        return NULL;
-      }
-
-      if (varsize != e->size) {
-        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
-           varsize, e->size);
-        return NULL;
-      }
-
-      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
-      entry.addr = (void *)varptr;
-
-      DeviceInfo.addOffloadEntry(device_id, entry);
-
-      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-          e->flags & OMP_DECLARE_TARGET_LINK) {
-        // If unified memory is present any target link variables
-        // can access host addresses directly. There is no longer a
-        // need for device copies.
-        err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
-                                                   sizeof(void *), device_id);
-        if (err != HSA_STATUS_SUCCESS)
-          DP("Error when copying USM\n");
-        DP("Copy linked variable host address (" DPxMOD ")"
-           "to device address (" DPxMOD ")\n",
-           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
-      }
-
-      continue;
-    }
-
-    DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name));
-
-    // errors in kernarg_segment_size previously treated as = 0 (or as undef)
-    uint32_t kernarg_segment_size = 0;
-    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
-    hsa_status_t err = HSA_STATUS_SUCCESS;
-    if (!e->name) {
-      err = HSA_STATUS_ERROR;
-    } else {
-      std::string kernelStr = std::string(e->name);
-      auto It = KernelInfoMap.find(kernelStr);
-      if (It != KernelInfoMap.end()) {
-        atl_kernel_info_t info = It->second;
-        kernarg_segment_size = info.kernel_segment_size;
-      } else {
-        err = HSA_STATUS_ERROR;
-      }
-    }
-
-    // default value GENERIC (in case symbol is missing from cubin file)
-    llvm::omp::OMPTgtExecModeFlags ExecModeVal =
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
-
-    // get flat group size if present, else Default_WG_Size
-    int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
-
-    // get Kernel Descriptor if present.
-    // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
-    struct KernDescValType {
-      uint16_t Version;
-      uint16_t TSize;
-      uint16_t WG_Size;
-    };
-    struct KernDescValType KernDescVal;
-    std::string KernDescNameStr(e->name);
-    KernDescNameStr += "_kern_desc";
-    const char *KernDescName = KernDescNameStr.c_str();
-
-    void *KernDescPtr;
-    uint32_t KernDescSize;
-    void *CallStackAddr = nullptr;
-    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
-                                  KernDescName, &KernDescPtr, &KernDescSize);
-
-    if (err == HSA_STATUS_SUCCESS) {
-      if ((size_t)KernDescSize != sizeof(KernDescVal))
-        DP("Loading global computation properties '%s' - size mismatch (%u != "
-           "%lu)\n",
-           KernDescName, KernDescSize, sizeof(KernDescVal));
-
-      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
-
-      // Check structure size against recorded size.
-      if ((size_t)KernDescSize != KernDescVal.TSize)
-        DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
-           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
-
-      DP("After loading global for %s KernDesc \n", KernDescName);
-      DP("KernDesc: Version: %d\n", KernDescVal.Version);
-      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
-      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
-
-      if (KernDescVal.WG_Size == 0) {
-        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
-        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
-      }
-      WGSizeVal = KernDescVal.WG_Size;
-      DP("WGSizeVal %d\n", WGSizeVal);
-      check("Loading KernDesc computation property", err);
-    } else {
-      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
-
-      // Flat group size
-      std::string WGSizeNameStr(e->name);
-      WGSizeNameStr += "_wg_size";
-      const char *WGSizeName = WGSizeNameStr.c_str();
-
-      void *WGSizePtr;
-      uint32_t WGSize;
-      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
-                                    WGSizeName, &WGSizePtr, &WGSize);
-
-      if (err == HSA_STATUS_SUCCESS) {
-        if ((size_t)WGSize != sizeof(int16_t)) {
-          DP("Loading global computation properties '%s' - size mismatch (%u "
-             "!= "
-             "%lu)\n",
-             WGSizeName, WGSize, sizeof(int16_t));
-          return NULL;
-        }
-
-        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
-
-        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
-
-        if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
-            WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
-          DP("Error wrong WGSize value specified in HSA code object file: "
-             "%d\n",
-             WGSizeVal);
-          WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
-        }
+      if (strcmp(DeviceName, ElfName) != 0) {
+        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
+           " compiler flag: -march=<gpu>\n",
+           DeviceName, ElfName);
       } else {
-        DP("Warning: Loading WGSize '%s' - symbol not found, "
-           "using default value %d\n",
-           WGSizeName, WGSizeVal);
+        DP("Error loading image onto GPU: %s\n", get_error_string(err));
       }
 
-      check("Loading WGSize computation property", err);
+      return NULL;
     }
 
-    // Read execution mode from global in binary
-    std::string ExecModeNameStr(e->name);
-    ExecModeNameStr += "_exec_mode";
-    const char *ExecModeName = ExecModeNameStr.c_str();
-
-    void *ExecModePtr;
-    uint32_t varsize;
-    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
-                                  ExecModeName, &ExecModePtr, &varsize);
+    err = env.after_loading();
+    if (err != HSA_STATUS_SUCCESS) {
+      return NULL;
+    }
+  }
 
-    if (err == HSA_STATUS_SUCCESS) {
-      if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
-        DP("Loading global computation properties '%s' - size mismatch(%u != "
-           "%lu)\n",
-           ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags));
-        return NULL;
-      }
+  DP("AMDGPU module successfully loaded!\n");
 
-      memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
+  {
+    // The device_State array is either a large array in BSS or a void* that
+    // needs to be assigned to point to an array of device_state_bytes bytes.
+    // If absent, it has been dead-stripped and needs no setup.
 
-      DP("After loading global for %s ExecMode = %d\n", ExecModeName,
-         ExecModeVal);
+    void *state_ptr;
+    uint32_t state_ptr_size;
+    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
+    hsa_status_t err = interop_hsa_get_symbol_info(
+        SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
+        &state_ptr_size);
 
-      if (ExecModeVal < 0 ||
-          ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
-        DP("Error wrong exec_mode value specified in HSA code object file: "
-           "%d\n",
-           ExecModeVal);
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("No device_state symbol found, skipping initialization\n");
+    } else {
+      if (state_ptr_size < sizeof(void *)) {
+        DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
+           sizeof(void *));
         return NULL;
       }
-    } else {
-      DP("Loading global exec_mode '%s' - symbol missing, using default "
-         "value "
-         "GENERIC (1)\n",
-         ExecModeName);
-    }
-    check("Loading computation property", err);
-
-    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
-                                   CallStackAddr, e->name, kernarg_segment_size,
-                                   DeviceInfo.KernArgPool));
-    __tgt_offload_entry entry = *e;
-    entry.addr = (void *)&KernelsList.back();
-    DeviceInfo.addOffloadEntry(device_id, entry);
-    DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
-  }
-
-  return DeviceInfo.getOffloadEntriesTable(device_id);
-}
-
-void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
-  void *ptr = NULL;
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-
-  if (kind != TARGET_ALLOC_DEFAULT) {
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
 
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id);
-  hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr);
-  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
-     (long long unsigned)(Elf64_Addr)ptr);
-  ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
-  return ptr;
-}
+      // if it's larger than a void*, assume it's a bss array and no further
+      // initialization is required. Only try to set up a pointer for
+      // sizeof(void*)
+      if (state_ptr_size == sizeof(void *)) {
+        uint64_t device_State_bytes =
+            get_device_State_bytes((char *)image->ImageStart, img_size);
+        if (device_State_bytes == 0) {
+          DP("Can't initialize device_State, missing size information\n");
+          return NULL;
+        }
 
-int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
-                              int64_t size) {
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-  __tgt_async_info AsyncInfo;
-  int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
-  if (rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
+        auto &dss = DeviceInfo.deviceStateStore[device_id];
+        if (dss.first.get() == nullptr) {
+          assert(dss.second == 0);
+          void *ptr = NULL;
+          hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
+          if (err != HSA_STATUS_SUCCESS) {
+            DP("Failed to allocate device_state array\n");
+            return NULL;
+          }
+          dss = {
+              std::unique_ptr<void, RTLDeviceInfoTy::implFreePtrDeletor>{ptr},
+              device_State_bytes,
+          };
+        }
 
-  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
-}
+        void *ptr = dss.first.get();
+        if (device_State_bytes != dss.second) {
+          DP("Inconsistent sizes of device_State unsupported\n");
+          return NULL;
+        }
 
-int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
-                                    int64_t size, __tgt_async_info *AsyncInfo) {
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-  if (AsyncInfo) {
-    initAsyncInfo(AsyncInfo);
-    return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
-  } else {
-    return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
+        // write ptr to device memory so it can be used by later kernels
+        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
+                                                   sizeof(void *), device_id);
+        if (err != HSA_STATUS_SUCCESS) {
+          DP("memcpy install of state_ptr failed\n");
+          return NULL;
+        }
+      }
+    }
   }
-}
 
-int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
-                                int64_t size) {
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-  __tgt_async_info AsyncInfo;
-  int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
-  if (rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
+  // Here, we take advantage of the data that is appended after img_end to get
+  // the names of the symbols we need to load. This data consists of the host
+  // entries begin and end as well as the target name (see the offloading
+  // linker script creation in the clang compiler).
 
-  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
-}
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name.
 
-int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
-                                      void *tgt_ptr, int64_t size,
-                                      __tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-  initAsyncInfo(AsyncInfo);
-  return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
-}
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
 
-int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
-  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
-  hsa_status_t err;
-  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
-  err = core::Runtime::Memfree(tgt_ptr);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error when freeing CUDA memory\n");
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
 
-// Determine launch values for kernel.
-struct launchVals {
-  int WorkgroupSize;
-  int GridSize;
-};
-launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
-                         int ConstWGSize,
-                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
-                         int num_teams, int thread_limit,
-                         uint64_t loop_tripcount, int DeviceNumTeams) {
+    if (!e->addr) {
+      // The host should always have something in the address to
+      // uniquely identify the target region.
+      DP("Analyzing host entry '<null>' (size = %lld)...\n",
+         (unsigned long long)e->size);
+      return NULL;
+    }
 
-  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
-  int num_groups = 0;
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
 
-  int Max_Teams =
-      Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
-  if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
-    Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
+      void *varptr;
+      uint32_t varsize;
 
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
-    DP("Max_Teams: %d\n", Max_Teams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
-    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
-    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
-       RTLDeviceInfoTy::Default_WG_Size);
-    DP("thread_limit: %d\n", thread_limit);
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("ConstWGSize: %d\n", ConstWGSize);
-  }
-  // check for thread_limit() clause
-  if (thread_limit > 0) {
-    threadsPerGroup = thread_limit;
-    DP("Setting threads per block to requested %d\n", thread_limit);
-    // Add master warp for GENERIC
-    if (ExecutionMode ==
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-      threadsPerGroup += WarpSize;
-      DP("Adding master wavefront: +%d threads\n", WarpSize);
-    }
-    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
-      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
-      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
-    }
-  }
-  // check flat_max_work_group_size attr here
-  if (threadsPerGroup > ConstWGSize) {
-    threadsPerGroup = ConstWGSize;
-    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
-       threadsPerGroup);
-  }
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-  DP("Preparing %d threads\n", threadsPerGroup);
+      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
+      hsa_status_t err = interop_hsa_get_symbol_info(
+          SymbolInfoMap, device_id, e->name, &varptr, &varsize);
 
-  // Set default num_groups (teams)
-  if (Env.TeamLimit > 0)
-    num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
-  else
-    num_groups = Max_Teams;
-  DP("Set default num of groups %d\n", num_groups);
+      if (err != HSA_STATUS_SUCCESS) {
+        // Inform the user what symbol prevented offloading
+        DP("Loading global '%s' (Failed)\n", e->name);
+        return NULL;
+      }
 
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", num_groups);
-    DP("num_teams: %d\n", num_teams);
-  }
+      if (varsize != e->size) {
+        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
+           varsize, e->size);
+        return NULL;
+      }
 
-  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
-  // This reduction is typical for default case (no thread_limit clause).
-  // or when user goes crazy with num_teams clause.
-  // FIXME: We cant distinguish between a constant or variable thread limit.
-  // So we only handle constant thread_limits.
-  if (threadsPerGroup >
-      RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest WarpSize
-    // here?
-    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
+      entry.addr = (void *)varptr;
 
-  // check for num_teams() clause
-  if (num_teams > 0) {
-    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", num_groups);
-    DP("Env.NumTeams %d\n", Env.NumTeams);
-    DP("Env.TeamLimit %d\n", Env.TeamLimit);
-  }
+      DeviceInfo.addOffloadEntry(device_id, entry);
 
-  if (Env.NumTeams > 0) {
-    num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
-    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
-  } else if (Env.TeamLimit > 0) {
-    num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
-    DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit);
-  } else {
-    if (num_teams <= 0) {
-      if (loop_tripcount > 0) {
-        if (ExecutionMode ==
-            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
-          // round up to the nearest integer
-          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
-        } else if (ExecutionMode ==
-                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-          num_groups = loop_tripcount;
-        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
-          // This is a generic kernel that was transformed to use SPMD-mode
-          // execution but uses Generic-mode semantics for scheduling.
-          num_groups = loop_tripcount;
-        }
-        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
-           "threads per block %d\n",
-           num_groups, loop_tripcount, threadsPerGroup);
+      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+          e->flags & OMP_DECLARE_TARGET_LINK) {
+        // If unified memory is present any target link variables
+        // can access host addresses directly. There is no longer a
+        // need for device copies.
+        err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
+                                                   sizeof(void *), device_id);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error when copying USM\n");
+        DP("Copy linked variable host address (" DPxMOD ")"
+           "to device address (" DPxMOD ")\n",
+           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
       }
-    } else {
-      num_groups = num_teams;
-    }
-    if (num_groups > Max_Teams) {
-      num_groups = Max_Teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
+
+      continue;
     }
-    if (num_groups > num_teams && num_teams > 0) {
-      num_groups = num_teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
-           num_teams);
+
+    DP("Trying to find the kernel name: %s, size: %lu\n", e->name,
+       strlen(e->name));
+
+    // Errors in kernarg_segment_size were previously treated as 0 (or as
+    // undefined).
+    uint32_t kernarg_segment_size = 0;
+    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
+    hsa_status_t err = HSA_STATUS_SUCCESS;
+    if (!e->name) {
+      err = HSA_STATUS_ERROR;
+    } else {
+      std::string kernelStr = std::string(e->name);
+      auto It = KernelInfoMap.find(kernelStr);
+      if (It != KernelInfoMap.end()) {
+        atl_kernel_info_t info = It->second;
+        kernarg_segment_size = info.kernel_segment_size;
+      } else {
+        err = HSA_STATUS_ERROR;
+      }
     }
-  }
 
-  // num_teams clause always honored, no matter what, unless DEFAULT is active.
-  if (num_teams > 0) {
-    num_groups = num_teams;
-    // Cap num_groups to EnvMaxTeamsDefault if set.
-    if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
-      num_groups = Env.MaxTeamsDefault;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("num_groups: %d\n", num_groups);
-    DP("loop_tripcount: %ld\n", loop_tripcount);
-  }
-  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
-     threadsPerGroup);
+    // default value GENERIC (in case symbol is missing from cubin file)
+    llvm::omp::OMPTgtExecModeFlags ExecModeVal =
+        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
 
-  launchVals res;
-  res.WorkgroupSize = threadsPerGroup;
-  res.GridSize = threadsPerGroup * num_groups;
-  return res;
-}
+    // get flat group size if present, else Default_WG_Size
+    int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
 
-static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
-  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
-  bool full = true;
-  while (full) {
-    full =
-        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
-  }
-  return packet_id;
-}
+    // get Kernel Descriptor if present.
+    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
+    struct KernDescValType {
+      uint16_t Version;
+      uint16_t TSize;
+      uint16_t WG_Size;
+    };
+    struct KernDescValType KernDescVal;
+    std::string KernDescNameStr(e->name);
+    KernDescNameStr += "_kern_desc";
+    const char *KernDescName = KernDescNameStr.c_str();
 
-static int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount);
+    void *KernDescPtr;
+    uint32_t KernDescSize;
+    void *CallStackAddr = nullptr;
+    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+                                  KernDescName, &KernDescPtr, &KernDescSize);
 
-int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
-                                         void **tgt_args,
-                                         ptrdiff_t *tgt_offsets,
-                                         int32_t arg_num, int32_t num_teams,
-                                         int32_t thread_limit,
-                                         uint64_t loop_tripcount) {
+    if (err == HSA_STATUS_SUCCESS) {
+      if ((size_t)KernDescSize != sizeof(KernDescVal))
+        DP("Loading global computation properties '%s' - size mismatch (%u != "
+           "%lu)\n",
+           KernDescName, KernDescSize, sizeof(KernDescVal));
 
-  DeviceInfo.load_run_lock.lock_shared();
-  int32_t res = __tgt_rtl_run_target_team_region_locked(
-      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
-      thread_limit, loop_tripcount);
+      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
 
-  DeviceInfo.load_run_lock.unlock_shared();
-  return res;
-}
+      // Check structure size against recorded size.
+      if ((size_t)KernDescSize != KernDescVal.TSize)
+        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
+           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
 
-int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount) {
-  // Set the context we are using
-  // update thread limit content in gpu memory if un-initialized or specified
-  // from host
+      DP("After loading global for %s KernDesc\n", KernDescName);
+      DP("KernDesc: Version: %d\n", KernDescVal.Version);
+      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
+      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
 
-  DP("Run target team region thread_limit %d\n", thread_limit);
+      if (KernDescVal.WG_Size == 0) {
+        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
+        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
+      }
+      WGSizeVal = KernDescVal.WG_Size;
+      DP("WGSizeVal %d\n", WGSizeVal);
+      check("Loading KernDesc computation property", err);
+    } else {
+      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
 
-  // All args are references.
-  std::vector<void *> args(arg_num);
-  std::vector<void *> ptrs(arg_num);
+      // Flat group size
+      std::string WGSizeNameStr(e->name);
+      WGSizeNameStr += "_wg_size";
+      const char *WGSizeName = WGSizeNameStr.c_str();
 
-  DP("Arg_num: %d\n", arg_num);
-  for (int32_t i = 0; i < arg_num; ++i) {
-    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
-    args[i] = &ptrs[i];
-    DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
-  }
+      void *WGSizePtr;
+      uint32_t WGSize;
+      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+                                    WGSizeName, &WGSizePtr, &WGSize);
 
-  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+      if (err == HSA_STATUS_SUCCESS) {
+        if ((size_t)WGSize != sizeof(int16_t)) {
+          DP("Loading global computation properties '%s' - size mismatch (%u "
+             "!= "
+             "%lu)\n",
+             WGSizeName, WGSize, sizeof(int16_t));
+          return NULL;
+        }
 
-  std::string kernel_name = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
-  if (KernelInfoTable[device_id].find(kernel_name) ==
-      KernelInfoTable[device_id].end()) {
-    DP("Kernel %s not found\n", kernel_name.c_str());
-    return OFFLOAD_FAIL;
-  }
+        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
 
-  const atl_kernel_info_t KernelInfoEntry =
-      KernelInfoTable[device_id][kernel_name];
-  const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
-  const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
-  const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
-  const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
-  const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
+        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
+
+        if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
+            WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
+          DP("Error: wrong WGSize value specified in HSA code object file: "
+             "%d\n",
+             WGSizeVal);
+          WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
+        }
+      } else {
+        DP("Warning: Loading WGSize '%s' - symbol not found, "
+           "using default value %d\n",
+           WGSizeName, WGSizeVal);
+      }
 
-  assert(arg_num == (int)KernelInfoEntry.explicit_argument_count);
+      check("Loading WGSize computation property", err);
+    }
 
-  /*
-   * Set limit based on ThreadsPerGroup and GroupsPerDevice
-   */
-  launchVals LV =
-      getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
-                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
-                    num_teams,      // From run_region arg
-                    thread_limit,   // From run_region arg
-                    loop_tripcount, // From run_region arg
-                    DeviceInfo.NumTeams[KernelInfo->device_id]);
-  const int GridSize = LV.GridSize;
-  const int WorkgroupSize = LV.WorkgroupSize;
+    // Read execution mode from global in binary
+    std::string ExecModeNameStr(e->name);
+    ExecModeNameStr += "_exec_mode";
+    const char *ExecModeName = ExecModeNameStr.c_str();
 
-  if (print_kernel_trace >= LAUNCH) {
-    int num_groups = GridSize / WorkgroupSize;
-    // enum modes are SPMD, GENERIC, NONE 0,1,2
-    // if doing rtl timing, print to stderr, unless stdout requested.
-    bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
-    fprintf(traceToStdout ? stdout : stderr,
-            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
-            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
-            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
-            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
-            arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
-            group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
-            vgpr_spill_count, loop_tripcount, KernelInfo->Name);
-  }
+    void *ExecModePtr;
+    uint32_t varsize;
+    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+                                  ExecModeName, &ExecModePtr, &varsize);
 
-  // Run on the device.
-  {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
-    if (!queue) {
-      return OFFLOAD_FAIL;
-    }
-    uint64_t packet_id = acquire_available_packet_id(queue);
+    if (err == HSA_STATUS_SUCCESS) {
+      if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
+        DP("Loading global computation properties '%s' - size mismatch(%u != "
+           "%lu)\n",
+           ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags));
+        return NULL;
+      }
 
-    const uint32_t mask = queue->size - 1; // size is a power of 2
-    hsa_kernel_dispatch_packet_t *packet =
-        (hsa_kernel_dispatch_packet_t *)queue->base_address +
-        (packet_id & mask);
+      memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
 
-    // packet->header is written last
-    packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-    packet->workgroup_size_x = WorkgroupSize;
-    packet->workgroup_size_y = 1;
-    packet->workgroup_size_z = 1;
-    packet->reserved0 = 0;
-    packet->grid_size_x = GridSize;
-    packet->grid_size_y = 1;
-    packet->grid_size_z = 1;
-    packet->private_segment_size = KernelInfoEntry.private_segment_size;
-    packet->group_segment_size = KernelInfoEntry.group_segment_size;
-    packet->kernel_object = KernelInfoEntry.kernel_object;
-    packet->kernarg_address = 0;     // use the block allocator
-    packet->reserved2 = 0;           // impl writes id_ here
-    packet->completion_signal = {0}; // may want a pool of signals
+      DP("After loading global for %s ExecMode = %d\n", ExecModeName,
+         ExecModeVal);
 
-    KernelArgPool *ArgPool = nullptr;
-    void *kernarg = nullptr;
-    {
-      auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
-      if (it != KernelArgPoolMap.end()) {
-        ArgPool = (it->second).get();
+      if (ExecModeVal < 0 ||
+          ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
+        DP("Error wrong exec_mode value specified in HSA code object file: "
+           "%d\n",
+           ExecModeVal);
+        return NULL;
       }
+    } else {
+      DP("Loading global exec_mode '%s' - symbol missing, using default "
+         "value "
+         "GENERIC (1)\n",
+         ExecModeName);
     }
-    if (!ArgPool) {
-      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
-         device_id);
-    }
-    {
-      if (ArgPool) {
-        assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
-        kernarg = ArgPool->allocate(arg_num);
-      }
-      if (!kernarg) {
-        DP("Allocate kernarg failed\n");
-        return OFFLOAD_FAIL;
-      }
+    check("Loading computation property", err);
 
-      // Copy explicit arguments
-      for (int i = 0; i < arg_num; i++) {
-        memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
-      }
+    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
+                                   CallStackAddr, e->name, kernarg_segment_size,
+                                   DeviceInfo.KernArgPool));
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+    DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
+  }
 
-      // Initialize implicit arguments. TODO: Which of these can be dropped
-      impl_implicit_args_t *impl_args =
-          reinterpret_cast<impl_implicit_args_t *>(
-              static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
-      memset(impl_args, 0,
-             sizeof(impl_implicit_args_t)); // may not be necessary
-      impl_args->offset_x = 0;
-      impl_args->offset_y = 0;
-      impl_args->offset_z = 0;
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
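For illustration, the two per-kernel globals consumed by the loader above could
look like this on the device-image side; the symbol names and initializers here
are hypothetical, not taken from a real image:

  // Hypothetical per-kernel globals matched by interop_get_symbol_info above.
  // "...foo_wg_size" stands in for whatever name WGSizeName resolves to.
  extern "C" {
  int16_t __omp_offloading_801_foo_wg_size = 256; // must be sizeof(int16_t)
  // One byte, sizeof(llvm::omp::OMPTgtExecModeFlags); 1 == GENERIC default.
  int8_t __omp_offloading_801_foo_exec_mode = 1;
  }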
 
-      // assign a hostcall buffer for the selected Q
-      if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
-        // hostrpc_assign_buffer is not thread safe, and this function is
-        // under a multiple reader lock, not a writer lock.
-        static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
-        pthread_mutex_lock(&hostcall_init_lock);
-        unsigned long buffer = hostrpc_assign_buffer(
-            DeviceInfo.HSAAgents[device_id], queue, device_id);
-        pthread_mutex_unlock(&hostcall_init_lock);
-        if (!buffer) {
-          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
-             "error\n");
-          return OFFLOAD_FAIL;
-        }
+void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
+  void *ptr = NULL;
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
 
-        if (KernelInfoEntry.implicit_argument_count >= 4) {
-          // Initialise pointer for implicit_argument_count != 0 ABI
-          // Guess that the right implicit argument is at offset 24 after
-          // the explicit arguments. In the future, should be able to read
-          // the offset from msgpack. Clang is not annotating it at present.
-          uint64_t Offset =
-              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
-          if ((Offset + 8) > (ArgPool->kernarg_segment_size)) {
-            DP("Bad offset of hostcall, exceeds kernarg segment size\n");
-          } else {
-            memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8);
-          }
-        }
+  if (kind != TARGET_ALLOC_DEFAULT) {
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+    return NULL;
+  }
 
-        // initialise pointer for implicit_argument_count == 0 ABI
-        impl_args->hostcall_ptr = buffer;
-      }
+  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id);
+  hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr);
+  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
+     (long long unsigned)(Elf64_Addr)ptr);
+  ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
+  return ptr;
+}
 
-      packet->kernarg_address = kernarg;
-    }
+int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+  __tgt_async_info AsyncInfo;
+  int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
 
-    hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
-    if (s.handle == 0) {
-      DP("Failed to get signal instance\n");
-      return OFFLOAD_FAIL;
-    }
-    packet->completion_signal = s;
-    hsa_signal_store_relaxed(packet->completion_signal, 1);
+  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
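The synchronous entry points (__tgt_rtl_data_submit above, __tgt_rtl_data_retrieve
below) share the same shape - enqueue on a stack-local __tgt_async_info, then
block - which is the seam the planned async split builds on. A minimal sketch of
that shape with a hypothetical helper, not part of this patch:

  // Hypothetical helper: run an enqueue callback, then drain its queue.
  template <typename EnqueueFn>
  static int32_t withBlockingSync(int device_id, EnqueueFn &&Enqueue) {
    __tgt_async_info AsyncInfo; // queue is created lazily on first use
    if (Enqueue(&AsyncInfo) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    return __tgt_rtl_synchronize(device_id, &AsyncInfo);
  }

With such a helper, __tgt_rtl_data_submit would reduce to a one-line lambda
over dataSubmit, and likewise for the retrieve path.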
 
-    // Publish the packet indicating it is ready to be processed
-    core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
-                               core::create_header(), packet->setup);
+int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
+                                    int64_t size, __tgt_async_info *AsyncInfo) {
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+  if (AsyncInfo) {
+    initAsyncInfo(AsyncInfo);
+    return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
+  } else {
+    return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
+  }
+}
 
-    // Since the packet is already published, its contents must not be
-    // accessed any more
-    hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+  __tgt_async_info AsyncInfo;
+  int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
 
-    while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                     HSA_WAIT_STATE_BLOCKED) != 0)
-      ;
+  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
 
-    assert(ArgPool);
-    ArgPool->deallocate(kernarg);
-    DeviceInfo.FreeSignalPool.push(s);
-  }
+int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
+                                      void *tgt_ptr, int64_t size,
+                                      __tgt_async_info *AsyncInfo) {
+  assert(AsyncInfo && "AsyncInfo is nullptr");
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+  initAsyncInfo(AsyncInfo);
+  return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
+}
 
-  DP("Kernel completed\n");
+int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
+  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+  hsa_status_t err;
+  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
+  err = core::Runtime::Memfree(tgt_ptr);
+  if (err != HSA_STATUS_SUCCESS) {
+    DP("Error when freeing CUDA memory\n");
+    return OFFLOAD_FAIL;
+  }
   return OFFLOAD_SUCCESS;
 }
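Taken together, the data-management entries support the usual allocate /
copy-in / copy-out / free round trip. A host-side sketch, assuming device 0 is
valid; everything here except the __tgt_rtl_* entry points and
TARGET_ALLOC_DEFAULT is hypothetical:

  int32_t roundTrip(void *HostBuf, int64_t Size) {
    void *Tgt = __tgt_rtl_data_alloc(/*device_id=*/0, Size,
                                     /*hst_ptr=*/nullptr, TARGET_ALLOC_DEFAULT);
    if (!Tgt)
      return OFFLOAD_FAIL;
    if (__tgt_rtl_data_submit(0, Tgt, HostBuf, Size) != OFFLOAD_SUCCESS ||
        __tgt_rtl_data_retrieve(0, HostBuf, Tgt, Size) != OFFLOAD_SUCCESS) {
      __tgt_rtl_data_delete(0, Tgt);
      return OFFLOAD_FAIL;
    }
    return __tgt_rtl_data_delete(0, Tgt);
  }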
 
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+                                         void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t num_teams,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+
+  DeviceInfo.load_run_lock.lock_shared();
+  int32_t res = __tgt_rtl_run_target_team_region_locked(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
+      thread_limit, loop_tripcount);
+
+  DeviceInfo.load_run_lock.unlock_shared();
+  return res;
+}
+
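The reader side of load_run_lock lets kernel launches proceed concurrently
while image loading takes the writer side (hence the hostcall caveat earlier in
the diff). The same guard in RAII form - a sketch assuming load_run_lock is a
std::shared_timed_mutex or similar:

  // std::shared_lock (<shared_mutex>) releases on every return path; the
  // patch itself uses explicit lock_shared()/unlock_shared() calls.
  std::shared_lock<decltype(DeviceInfo.load_run_lock)> Guard(
      DeviceInfo.load_run_lock);
  return __tgt_rtl_run_target_team_region_locked(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
      thread_limit, loop_tripcount);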
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                     int32_t arg_num) {