[Openmp-commits] [openmp] 4031bb9 - [OpenMP] Refined CUDA plugin to put all CUDA operations into class

Shilei Tian via Openmp-commits openmp-commits at lists.llvm.org
Mon Apr 13 10:32:52 PDT 2020


Author: Shilei Tian
Date: 2020-04-13T13:32:46-04:00
New Revision: 4031bb982b7a9bf8603851516ad72374ccc09a6f

URL: https://github.com/llvm/llvm-project/commit/4031bb982b7a9bf8603851516ad72374ccc09a6f
DIFF: https://github.com/llvm/llvm-project/commit/4031bb982b7a9bf8603851516ad72374ccc09a6f.diff

LOG: [OpenMP] Refined CUDA plugin to put all CUDA operations into class

Summary: The current implementation mixes everything together, so there is almost no encapsulation. In this patch, all CUDA-related operations are put into a new class, DeviceRTLTy, and only the necessary functions are exposed. In addition, all C++ code now conforms to the LLVM coding standard, while the exposed API functions keep their C-style interfaces.
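To make the intended structure concrete, here is a minimal sketch of the pattern described above. It is not code from the patch; the names (SketchDeviceRTLTy, sketch_init_device, the hard-coded device count) are made up for illustration. The idea is the same as in the patch: per-device state is owned by one class, a single file-scope instance holds it, and the C-style entry points only validate their arguments and forward into that instance.

// Illustrative sketch only; not part of the patch.
#include <cassert>
#include <cstdio>
#include <vector>

namespace {
class SketchDeviceRTLTy {
  // Per-device data lives inside the class instead of parallel global vectors.
  struct DeviceDataTy {
    int ThreadsPerBlock = 0;
    int BlocksPerGrid = 0;
  };
  std::vector<DeviceDataTy> DeviceData;

public:
  SketchDeviceRTLTy() : DeviceData(2) {} // pretend two devices were detected

  bool isValidDeviceId(int DeviceId) const {
    return DeviceId >= 0 && DeviceId < static_cast<int>(DeviceData.size());
  }

  int initDevice(int DeviceId) {
    // The real plugin would query CUDA attributes here; the sketch stores defaults.
    DeviceData[DeviceId].ThreadsPerBlock = 128;
    DeviceData[DeviceId].BlocksPerGrid = 128;
    return 0; // OFFLOAD_SUCCESS
  }
};

// One file-scope instance owns all device state.
SketchDeviceRTLTy SketchRTL;
} // namespace

// C-style entry points keep flat signatures and only forward to the instance.
extern "C" int sketch_init_device(int device_id) {
  assert(SketchRTL.isValidDeviceId(device_id) && "device_id is invalid");
  return SketchRTL.initDevice(device_id);
}

int main() {
  std::printf("init device 0 -> %d\n", sketch_init_device(0));
  return 0;
}

In the patch itself, DeviceRTLTy plays this role for the real CUDA state (contexts, modules, stream pool), and each __tgt_rtl_* function forwards to a method on the file-scope DeviceRTL object.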

Reviewers: jdoerfert

Reviewed By: jdoerfert

Subscribers: jfb, yaxunl, guansong, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D77951

Added: 
    

Modified: 
    openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 4c38ee8c2d28..4ad58e290252 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -10,7 +10,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <atomic>
 #include <cassert>
 #include <cstddef>
 #include <cuda.h>
@@ -62,7 +61,7 @@ struct FuncOrGblEntryTy {
 
 enum ExecutionModeType {
   SPMD, // constructors, destructors,
-        // combined constructs (`teams distribute parallel for [simd]`)
+  // combined constructs (`teams distribute parallel for [simd]`)
   GENERIC, // everything else
   NONE
 };
@@ -100,18 +99,32 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
   CUDA_ERR_STRING(Err);
   return false;
 }
-} // namespace
+
+// Structure contains per-device data
+struct DeviceDataTy {
+  std::list<FuncOrGblEntryTy> FuncGblEntries;
+  CUcontext Context = nullptr;
+  // Device properties
+  int ThreadsPerBlock = 0;
+  int BlocksPerGrid = 0;
+  int WarpSize = 0;
+  // OpenMP properties
+  int NumTeams = 0;
+  int NumThreads = 0;
+};
 
 class StreamManagerTy {
   int NumberOfDevices;
+  // The initial size of stream pool
+  int EnvNumInitialStreams;
   // Per-device stream mutex
   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
   // Per-device stream Id indicates the next available stream in the pool
   std::vector<int> NextStreamId;
   // Per-device stream pool
   std::vector<std::vector<CUstream>> StreamPool;
-  // Pointer to per-device context
-  std::vector<CUcontext> &ContextsPtr;
+  // Reference to per-device data
+  std::vector<DeviceDataTy> &DeviceData;
 
   // If there is no CUstream left in the pool, we will resize the pool to
   // allocate more CUstream. This function should be called with device mutex,
@@ -121,10 +134,8 @@ class StreamManagerTy {
     const size_t CurrentSize = Pool.size();
     assert(NewSize > CurrentSize && "new size is not larger than current size");
 
-    Pool.resize(NewSize, nullptr);
-
-    CUresult err = cuCtxSetCurrent(ContextsPtr[DeviceId]);
-    if (!checkResult(err, "Error when setting current CUDA context\n")) {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
       // We will return if cannot switch to the right context in case of
       // creating bunch of streams that are not corresponding to the right
       // device. The offloading will fail later because selected CUstream is
@@ -132,29 +143,25 @@ class StreamManagerTy {
       return;
     }
 
+    Pool.resize(NewSize, nullptr);
+
     for (size_t I = CurrentSize; I < NewSize; ++I) {
-      err = cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING);
-      checkResult(err,
-                  "Error when creating CUDA stream to resize stream pool\n");
+      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
+                  "Error returned from cuStreamCreate\n");
     }
   }
 
 public:
-  StreamManagerTy(const int NumberOfDevices, std::vector<CUcontext> &CtxPtr)
-      : NumberOfDevices(NumberOfDevices), ContextsPtr(CtxPtr) {
+  StreamManagerTy(const int NumberOfDevices,
+                  std::vector<DeviceDataTy> &DeviceData)
+      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
+        DeviceData(DeviceData) {
     StreamPool.resize(NumberOfDevices);
     NextStreamId.resize(NumberOfDevices);
     StreamMtx.resize(NumberOfDevices);
 
-    // Initially let's create 32 streams for each device
-    int EnvNumInitialStreams = 32;
-    char *envStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS");
-    if (envStr)
-      EnvNumInitialStreams = std::stoi(envStr);
-
-    // Initialize the stream pool for each device
-    for (std::vector<CUstream> &S : StreamPool)
-      S.resize(EnvNumInitialStreams);
+    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
+      EnvNumInitialStreams = std::stoi(EnvStr);
 
     // Initialize the next stream id
     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
@@ -167,14 +174,13 @@ class StreamManagerTy {
   ~StreamManagerTy() {
     // Destroy streams
     for (int I = 0; I < NumberOfDevices; ++I) {
-      CUresult err = cuCtxSetCurrent(ContextsPtr[I]);
-      checkResult(err, "Error when setting current CUDA context\n");
+      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
+                  "Error returned from cuCtxSetCurrent\n");
 
       for (CUstream &S : StreamPool[I]) {
-        if (!S)
-          continue;
-        err = cuStreamDestroy(S);
-        checkResult(err, "Error when destroying CUDA stream\n");
+        if (S)
+          checkResult(cuStreamDestroy(S),
+                      "Error returned from cuStreamDestroy\n");
       }
     }
   }
@@ -192,10 +198,6 @@ class StreamManagerTy {
   //       ^
   //       id
   CUstream getStream(const int DeviceId) {
-    assert(DeviceId >= 0 &&
-           static_cast<size_t>(DeviceId) < NextStreamId.size() &&
-           "Unexpected device id");
-
     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
     int &Id = NextStreamId[DeviceId];
     // No CUstream left in the pool, we need to request from CUDA RT
@@ -221,632 +223,709 @@ class StreamManagerTy {
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
   void returnStream(const int DeviceId, CUstream Stream) {
-    assert(DeviceId >= 0 &&
-           static_cast<size_t>(DeviceId) < NextStreamId.size() &&
-           "Unexpected device id");
-
     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
     int &Id = NextStreamId[DeviceId];
     assert(Id > 0 && "Wrong stream ID");
     StreamPool[DeviceId][--Id] = Stream;
   }
 
-  void initializeDevice(int DeviceId) {
-    // This function should be called after setting right context
-    for (CUstream &Stream : StreamPool[DeviceId]) {
-      CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
-      checkResult(Err, "Error when creating CUDA stream\n");
-    }
-  }
-};
+  bool initializeDeviceStreamPool(const int DeviceId) {
+    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
 
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-  std::shared_ptr<StreamManagerTy> StreamManager;
+    resizeStreamPool(DeviceId, EnvNumInitialStreams);
 
-public:
-  int NumberOfDevices;
-  std::vector<CUmodule> Modules;
-  std::vector<CUcontext> Contexts;
+    // Check the size of stream pool
+    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
+      return false;
 
-  // Device properties
-  std::vector<int> ThreadsPerBlock;
-  std::vector<int> BlocksPerGrid;
-  std::vector<int> WarpSize;
+    // Check whether each stream is valid
+    for (CUstream &S : StreamPool[DeviceId])
+      if (!S)
+        return false;
 
-  // OpenMP properties
-  std::vector<int> NumTeams;
-  std::vector<int> NumThreads;
+    return true;
+  }
+};
 
-  // OpenMP Environment properties
+class DeviceRTLTy {
+  int NumberOfDevices;
+  // OpenMP environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
-
-  // OpenMP Requires Flags
+  // OpenMP requires flags
   int64_t RequiresFlags;
 
-  // static int EnvNumThreads;
-  static const int HardTeamLimit = 1 << 16; // 64k
-  static const int HardThreadLimit = 1024;
-  static const int DefaultNumTeams = 128;
-  static const int DefaultNumThreads = 128;
-
-  std::shared_ptr<StreamManagerTy> getStreamManager() { return StreamManager; }
-
-  CUstream getStream(const int DeviceId) {
-    return StreamManager->getStream(DeviceId);
-  }
+  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
+  static constexpr const int HardThreadLimit = 1024;
+  static constexpr const int DefaultNumTeams = 128;
+  static constexpr const int DefaultNumThreads = 128;
 
-  void returnStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) {
-    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
-    assert(AsyncInfoPtr->Queue && "AsyncInfoPtr->Queue is nullptr");
-
-    StreamManager->returnStream(
-        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
-    AsyncInfoPtr->Queue = nullptr;
-  }
+  std::unique_ptr<StreamManagerTy> StreamManager;
+  std::vector<DeviceDataTy> DeviceData;
+  std::vector<CUmodule> Modules;
 
   // Record entry point associated with device
-  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
-    assert(device_id < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
+  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
+    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
     E.Entries.push_back(entry);
   }
 
   // Return true if the entry is associated with device
-  bool findOffloadEntry(int32_t device_id, void *addr) {
-    assert(device_id < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
-    for (auto &it : E.Entries) {
-      if (it.addr == addr)
+  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
+    for (const __tgt_offload_entry &Itr :
+         DeviceData[DeviceId].FuncGblEntries.back().Entries)
+      if (Itr.addr == Addr)
         return true;
-    }
 
     return false;
   }
 
   // Return the pointer to the target entries table
-  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
-    assert(device_id < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
-    int32_t size = E.Entries.size();
+  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
+    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
 
-    // Table is empty
-    if (!size)
-      return 0;
-
-    __tgt_offload_entry *begin = &E.Entries[0];
-    __tgt_offload_entry *end = &E.Entries[size - 1];
+    if (E.Entries.empty())
+      return nullptr;
 
     // Update table info according to the entries and return the pointer
-    E.Table.EntriesBegin = begin;
-    E.Table.EntriesEnd = ++end;
+    E.Table.EntriesBegin = E.Entries.data();
+    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
 
     return &E.Table;
   }
 
   // Clear entries table for a device
-  void clearOffloadEntriesTable(int32_t device_id) {
-    assert(device_id < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncGblEntries[device_id].emplace_back();
-    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+  void clearOffloadEntriesTable(const int DeviceId) {
+    DeviceData[DeviceId].FuncGblEntries.emplace_back();
+    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
     E.Entries.clear();
-    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
+  }
+
+  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+
+    if (!AsyncInfoPtr->Queue)
+      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
+
+    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
   }
 
-  RTLDeviceInfoTy() {
+public:
+  // This class should not be copied
+  DeviceRTLTy(const DeviceRTLTy &) = delete;
+  DeviceRTLTy(DeviceRTLTy &&) = delete;
+
+  DeviceRTLTy()
+      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
+        RequiresFlags(OMP_REQ_UNDEFINED) {
 #ifdef OMPTARGET_DEBUG
-    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
-      DebugLevel = std::stoi(envStr);
-    }
+    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
+      DebugLevel = std::stoi(EnvStr);
 #endif // OMPTARGET_DEBUG
 
     DP("Start initializing CUDA\n");
 
-    CUresult err = cuInit(0);
-    if (err != CUDA_SUCCESS) {
-      DP("Error when initializing CUDA\n");
-      CUDA_ERR_STRING(err);
+    CUresult Err = cuInit(0);
+    if (!checkResult(Err, "Error returned from cuInit\n")) {
       return;
     }
 
-    NumberOfDevices = 0;
-
-    err = cuDeviceGetCount(&NumberOfDevices);
-    if (err != CUDA_SUCCESS) {
-      DP("Error when getting CUDA device count\n");
-      CUDA_ERR_STRING(err);
+    Err = cuDeviceGetCount(&NumberOfDevices);
+    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
       return;
-    }
 
     if (NumberOfDevices == 0) {
       DP("There are no devices supporting CUDA.\n");
       return;
     }
 
-    FuncGblEntries.resize(NumberOfDevices);
-    Contexts.resize(NumberOfDevices);
-    ThreadsPerBlock.resize(NumberOfDevices);
-    BlocksPerGrid.resize(NumberOfDevices);
-    WarpSize.resize(NumberOfDevices);
-    NumTeams.resize(NumberOfDevices);
-    NumThreads.resize(NumberOfDevices);
+    DeviceData.resize(NumberOfDevices);
 
     // Get environment variables regarding teams
-    char *envStr = getenv("OMP_TEAM_LIMIT");
-    if (envStr) {
+    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
       // OMP_TEAM_LIMIT has been set
-      EnvTeamLimit = std::stoi(envStr);
+      EnvTeamLimit = std::stoi(EnvStr);
       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
-    } else {
-      EnvTeamLimit = -1;
     }
-    envStr = getenv("OMP_NUM_TEAMS");
-    if (envStr) {
+    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
       // OMP_NUM_TEAMS has been set
-      EnvNumTeams = std::stoi(envStr);
+      EnvNumTeams = std::stoi(EnvStr);
       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
-    } else {
-      EnvNumTeams = -1;
     }
 
     StreamManager =
-        std::make_shared<StreamManagerTy>(NumberOfDevices, Contexts);
-
-    // Default state.
-    RequiresFlags = OMP_REQ_UNDEFINED;
+        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
   }
 
-  ~RTLDeviceInfoTy() {
+  ~DeviceRTLTy() {
     // First destruct stream manager in case of Contexts is destructed before it
     StreamManager = nullptr;
 
-    // Close modules
-    for (auto &module : Modules)
-      if (module) {
-        CUresult err = cuModuleUnload(module);
-        if (err != CUDA_SUCCESS) {
-          DP("Error when unloading CUDA module\n");
-          CUDA_ERR_STRING(err);
-        }
-      }
+    for (CUmodule &M : Modules)
+      // Close module
+      if (M)
+        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
 
-    // Destroy contexts
-    for (auto &ctx : Contexts)
-      if (ctx) {
-        CUresult err = cuCtxDestroy(ctx);
-        if (err != CUDA_SUCCESS) {
-          DP("Error when destroying CUDA context\n");
-          CUDA_ERR_STRING(err);
-        }
-      }
+    for (DeviceDataTy &D : DeviceData) {
+      // Destroy context
+      if (D.Context)
+        checkResult(cuCtxDestroy(D.Context),
+                    "Error returned from cuCtxDestroy\n");
+    }
   }
-};
-
-static RTLDeviceInfoTy DeviceInfo;
 
-namespace {
-CUstream getStream(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+  // Check whether a given DeviceId is valid
+  bool isValidDeviceId(const int DeviceId) const {
+    return DeviceId >= 0 && DeviceId < NumberOfDevices;
+  }
 
-  if (!AsyncInfoPtr->Queue)
-    AsyncInfoPtr->Queue = DeviceInfo.getStream(DeviceId);
+  bool getNumOfDevices() const { return NumberOfDevices; }
+
+  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
+
+  int initDevice(const int DeviceId) {
+    CUdevice Device;
+
+    DP("Getting device %d\n", DeviceId);
+    CUresult Err = cuDeviceGet(&Device, DeviceId);
+    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
+      return OFFLOAD_FAIL;
+
+    // Create the context and save it to use whenever this device is selected.
+    Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
+                      Device);
+    if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
+      return OFFLOAD_FAIL;
+
+    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
+
+    // Initialize stream pool
+    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
+      return OFFLOAD_FAIL;
+
+    // Query attributes to determine number of threads/block and blocks/grid.
+    int MaxGridDimX;
+    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                               Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting max grid dimension, use default value %d\n",
+         DeviceRTLTy::DefaultNumTeams);
+      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
+    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
+      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
+      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
+    } else {
+      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+         "at the hard limit\n",
+         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
+      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
+    }
 
-  return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
-}
+    // We are only exploiting threads along the x axis.
+    int MaxBlockDimX;
+    Err = cuDeviceGetAttribute(&MaxBlockDimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting max block dimension, use default value %d\n",
+         DeviceRTLTy::DefaultNumThreads);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
+    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
+      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
+    } else {
+      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+         "capping at the hard limit\n",
+         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+    }
 
-int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
-                     __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+    // Get and set warp size
+    int WarpSize;
+    Err =
+        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting warp size, assume default value 32\n");
+      DeviceData[DeviceId].WarpSize = 32;
+    } else {
+      DP("Using warp size %d\n", WarpSize);
+      DeviceData[DeviceId].WarpSize = WarpSize;
+    }
 
-  CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
+    // Adjust teams to the env variables
+    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
+      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+         EnvTeamLimit);
+      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
+    }
 
-  err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
+       DeviceData[DeviceId].WarpSize);
 
-  return OFFLOAD_SUCCESS;
-}
+    // Set default number of teams
+    if (EnvNumTeams > 0) {
+      DP("Default number of teams set according to environment %d\n",
+         EnvNumTeams);
+      DeviceData[DeviceId].NumTeams = EnvNumTeams;
+    } else {
+      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
+      DP("Default number of teams set according to library's default %d\n",
+         DeviceRTLTy::DefaultNumTeams);
+    }
 
-int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
-                   __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Default number of teams exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
+    }
 
-  CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
+    // Set default number of threads
+    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
+    DP("Default number of threads set according to library's default %d\n",
+       DeviceRTLTy::DefaultNumThreads);
+    if (DeviceData[DeviceId].NumThreads >
+        DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Default number of threads exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
+    }
 
-  err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
+    return OFFLOAD_SUCCESS;
   }
 
-  return OFFLOAD_SUCCESS;
-}
-} // namespace
+  __tgt_target_table *loadBinary(const int DeviceId,
+                                 const __tgt_device_image *Image) {
+    // Set the context we are using
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return nullptr;
+
+    // Clear the offload table as we are going to create a new one.
+    clearOffloadEntriesTable(DeviceId);
+
+    // Create the module and extract the function pointers.
+    CUmodule Module;
+    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
+    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
+    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
+      return nullptr;
+
+    DP("CUDA module successfully loaded!\n");
+
+    Modules.push_back(Module);
+
+    // Find the symbols in the module by name.
+    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
+    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
+
+    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
+      if (!E->addr) {
+        // We return nullptr when something like this happens, the host should
+        // have always something in the address to uniquely identify the target
+        // region.
+        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
+        return nullptr;
+      }
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+      if (E->size) {
+        __tgt_offload_entry Entry = *E;
+        CUdeviceptr CUPtr;
+        size_t CUSize;
+        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
+        // We keep this style here because we need the name
+        if (Err != CUDA_SUCCESS) {
+          DP("Loading global '%s' (Failed)\n", E->name);
+          CUDA_ERR_STRING(Err);
+          return nullptr;
+        }
 
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
-  return elf_check_machine(image, 190); // EM_CUDA = 190.
-}
+        if (CUSize != E->size) {
+          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
+             CUSize, E->size);
+          return nullptr;
+        }
 
-int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
+
+        Entry.addr = (void *)(CUPtr);
+
+        // Note: In the current implementation declare target variables
+        // can either be link or to. This means that once unified
+        // memory is activated via the requires directive, the variable
+        // can be used directly from the host in both cases.
+        // TODO: when variables types other than to or link are added,
+        // the below condition should be changed to explicitly
+        // check for to and link variables types:
+        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
+        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
+        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+          // If unified memory is present any target link or to variables
+          // can access host addresses directly. There is no longer a
+          // need for device copies.
+          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
+          DP("Copy linked variable host address (" DPxMOD
+             ") to device address (" DPxMOD ")\n",
+             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
+        }
 
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo.RequiresFlags = RequiresFlags;
-  return RequiresFlags;
-}
+        addOffloadEntry(DeviceId, Entry);
 
-int32_t __tgt_rtl_init_device(int32_t device_id) {
+        continue;
+      }
 
-  CUdevice cuDevice;
-  DP("Getting device %d\n", device_id);
-  CUresult err = cuDeviceGet(&cuDevice, device_id);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when getting CUDA device with id = %d\n", device_id);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+      CUfunction Func;
+      Err = cuModuleGetFunction(&Func, Module, E->name);
+      // We keep this style here because we need the name
+      if (Err != CUDA_SUCCESS) {
+        DP("Loading '%s' (Failed)\n", E->name);
+        CUDA_ERR_STRING(Err);
+        return nullptr;
+      }
 
-  // Create the context and save it to use whenever this device is selected.
-  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
-                    cuDevice);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when creating a CUDA context\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
+
+      // default value GENERIC (in case symbol is missing from cubin file)
+      int8_t ExecModeVal = ExecutionModeType::GENERIC;
+      std::string ExecModeNameStr(E->name);
+      ExecModeNameStr += "_exec_mode";
+      const char *ExecModeName = ExecModeNameStr.c_str();
+
+      CUdeviceptr ExecModePtr;
+      size_t CUSize;
+      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
+      if (Err == CUDA_SUCCESS) {
+        if (CUSize != sizeof(int8_t)) {
+          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+             ExecModeName, CUSize, sizeof(int8_t));
+          return nullptr;
+        }
 
-  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting current CUDA context\n");
-    CUDA_ERR_STRING(err);
-  }
+        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
+        if (Err != CUDA_SUCCESS) {
+          DP("Error when copying data from device to host. Pointers: "
+             "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
+          CUDA_ERR_STRING(Err);
+          return nullptr;
+        }
 
-  // Initialize stream pool
-  DeviceInfo.getStreamManager()->initializeDevice(device_id);
-
-  // Query attributes to determine number of threads/block and blocks/grid.
-  int maxGridDimX;
-  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                             cuDevice);
-  if (err != CUDA_SUCCESS) {
-    DP("Error getting max grid dimension, use default\n");
-    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
-  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
-    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
-    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
-  } else {
-    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
-    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
-       "at the hard limit\n",
-       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
-  }
+        if (ExecModeVal < 0 || ExecModeVal > 1) {
+          DP("Error wrong exec_mode value specified in cubin file: %d\n",
+             ExecModeVal);
+          return nullptr;
+        }
+      } else {
+        DP("Loading global exec_mode '%s' - symbol missing, using default "
+           "value GENERIC (1)\n",
+           ExecModeName);
+        CUDA_ERR_STRING(Err);
+      }
 
-  // We are only exploiting threads along the x axis.
-  int maxBlockDimX;
-  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                             cuDevice);
-  if (err != CUDA_SUCCESS) {
-    DP("Error getting max block dimension, use default\n");
-    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
-  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
-    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
-    DP("Using %d CUDA threads per block\n", maxBlockDimX);
-  } else {
-    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
-    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
-       "at the hard limit\n",
-       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
-  }
+      KernelsList.emplace_back(Func, ExecModeVal);
 
-  int warpSize;
-  err =
-      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
-  if (err != CUDA_SUCCESS) {
-    DP("Error getting warp size, assume default\n");
-    DeviceInfo.WarpSize[device_id] = 32;
-  } else {
-    DeviceInfo.WarpSize[device_id] = warpSize;
-  }
+      __tgt_offload_entry Entry = *E;
+      Entry.addr = &KernelsList.back();
+      addOffloadEntry(DeviceId, Entry);
+    }
 
-  // Adjust teams to the env variables
-  if (DeviceInfo.EnvTeamLimit > 0 &&
-      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
-    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
-    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
-        DeviceInfo.EnvTeamLimit);
-  }
+    // send device environment data to the device
+    {
+      omptarget_device_environmentTy DeviceEnv{0};
 
-  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
-     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
-     DeviceInfo.WarpSize[device_id]);
-
-  // Set default number of teams
-  if (DeviceInfo.EnvNumTeams > 0) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
-    DP("Default number of teams set according to environment %d\n",
-        DeviceInfo.EnvNumTeams);
-  } else {
-    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
-    DP("Default number of teams set according to library's default %d\n",
-        RTLDeviceInfoTy::DefaultNumTeams);
-  }
-  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
-    DP("Default number of teams exceeds device limit, capping at %d\n",
-        DeviceInfo.BlocksPerGrid[device_id]);
-  }
+#ifdef OMPTARGET_DEBUG
+      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
+        DeviceEnv.debug_level = std::stoi(EnvStr);
+#endif
 
-  // Set default number of threads
-  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
-  DP("Default number of threads set according to library's default %d\n",
-          RTLDeviceInfoTy::DefaultNumThreads);
-  if (DeviceInfo.NumThreads[device_id] >
-      DeviceInfo.ThreadsPerBlock[device_id]) {
-    DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
-    DP("Default number of threads exceeds device limit, capping at %d\n",
-        DeviceInfo.ThreadsPerBlock[device_id]);
-  }
+      const char *DeviceEnvName = "omptarget_device_environment";
+      CUdeviceptr DeviceEnvPtr;
+      size_t CUSize;
+
+      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
+      if (Err == CUDA_SUCCESS) {
+        if (CUSize != sizeof(DeviceEnv)) {
+          DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+             DeviceEnvName, CUSize, sizeof(int32_t));
+          CUDA_ERR_STRING(Err);
+          return nullptr;
+        }
 
-  return OFFLOAD_SUCCESS;
-}
+        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
+        if (Err != CUDA_SUCCESS) {
+          DP("Error when copying data from host to device. Pointers: "
+             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+             DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
+          CUDA_ERR_STRING(Err);
+          return nullptr;
+        }
 
-__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
-    __tgt_device_image *image) {
-
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting a CUDA context for device %d\n", device_id);
-    CUDA_ERR_STRING(err);
-    return NULL;
+        DP("Sending global device environment data %zu bytes\n", CUSize);
+      } else {
+        DP("Finding global device environment '%s' - symbol missing.\n",
+           DeviceEnvName);
+        DP("Continue, considering this is a device RTL which does not accept "
+           "environment setting.\n");
+      }
+    }
+
+    return getOffloadEntriesTable(DeviceId);
   }
 
-  // Clear the offload table as we are going to create a new one.
-  DeviceInfo.clearOffloadEntriesTable(device_id);
+  void *dataAlloc(const int DeviceId, const int64_t Size) const {
+    if (Size == 0)
+      return nullptr;
 
-  // Create the module and extract the function pointers.
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return nullptr;
 
-  CUmodule cumod;
-  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
-  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when loading CUDA module\n");
-    CUDA_ERR_STRING(err);
-    return NULL;
-  }
+    CUdeviceptr DevicePtr;
+    Err = cuMemAlloc(&DevicePtr, Size);
+    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+      return nullptr;
 
-  DP("CUDA module successfully loaded!\n");
-  DeviceInfo.Modules.push_back(cumod);
+    return (void *)DevicePtr;
+  }
 
-  // Find the symbols in the module by name.
-  __tgt_offload_entry *HostBegin = image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
+                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
+    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
 
-  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
 
-    if (!e->addr) {
-      // We return NULL when something like this happens, the host should have
-      // always something in the address to uniquely identify the target region.
-      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
 
-      return NULL;
+    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+         ", device = " DPxMOD ", size = %" PRId64 "\n",
+         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
     }
 
-    if (e->size) {
-      __tgt_offload_entry entry = *e;
+    return OFFLOAD_SUCCESS;
+  }
+
+  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
+                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
+    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
 
-      CUdeviceptr cuptr;
-      size_t cusize;
-      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
 
-      if (err != CUDA_SUCCESS) {
-        DP("Loading global '%s' (Failed)\n", e->name);
-        CUDA_ERR_STRING(err);
-        return NULL;
-      }
+    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
 
-      if (cusize != e->size) {
-        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
-            cusize, e->size);
-        CUDA_ERR_STRING(err);
-        return NULL;
-      }
+    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+         ", device = " DPxMOD ", size = %" PRId64 "\n",
+         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
+    }
 
-      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
-      entry.addr = (void *)cuptr;
-
-      // Note: In the current implementation declare target variables
-      // can either be link or to. This means that once unified
-      // memory is activated via the requires directive, the variable
-      // can be used directly from the host in both cases.
-      // TODO: when variables types other than to or link are added,
-      // the below condition should be changed to explicitly
-      // check for to and link variables types:
-      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
-      //    e->flags == OMP_DECLARE_TARGET_TO))
-      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
-        // If unified memory is present any target link or to variables
-        // can access host addresses directly. There is no longer a
-        // need for device copies.
-        cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
-        DP("Copy linked variable host address (" DPxMOD ")"
-           "to device address (" DPxMOD ")\n",
-          DPxPTR(*((void**)e->addr)), DPxPTR(cuptr));
-      }
+    return OFFLOAD_SUCCESS;
+  }
 
-      DeviceInfo.addOffloadEntry(device_id, entry);
+  int dataDelete(const int DeviceId, void *TgtPtr) const {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
 
-      continue;
-    }
+    Err = cuMemFree((CUdeviceptr)TgtPtr);
+    if (!checkResult(Err, "Error returned from cuMemFree\n"))
+      return OFFLOAD_FAIL;
 
-    CUfunction fun;
-    err = cuModuleGetFunction(&fun, cumod, e->name);
+    return OFFLOAD_SUCCESS;
+  }
 
-    if (err != CUDA_SUCCESS) {
-      DP("Loading '%s' (Failed)\n", e->name);
-      CUDA_ERR_STRING(err);
-      return NULL;
+  int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
+                          void **TgtArgs, ptrdiff_t *TgtOffsets,
+                          const int ArgNum, const int TeamNum,
+                          const int ThreadLimit,
+                          const unsigned int LoopTripCount,
+                          __tgt_async_info *AsyncInfo) const {
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
+
+    // All args are references.
+    std::vector<void *> Args(ArgNum);
+    std::vector<void *> Ptrs(ArgNum);
+
+    for (int I = 0; I < ArgNum; ++I) {
+      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+      Args[I] = &Ptrs[I];
     }
 
-    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
-        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
-
-    // default value GENERIC (in case symbol is missing from cubin file)
-    int8_t ExecModeVal = ExecutionModeType::GENERIC;
-    std::string ExecModeNameStr (e->name);
-    ExecModeNameStr += "_exec_mode";
-    const char *ExecModeName = ExecModeNameStr.c_str();
-
-    CUdeviceptr ExecModePtr;
-    size_t cusize;
-    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
-    if (err == CUDA_SUCCESS) {
-      if ((size_t)cusize != sizeof(int8_t)) {
-        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-           ExecModeName, cusize, sizeof(int8_t));
-        CUDA_ERR_STRING(err);
-        return NULL;
+    const KernelTy *KernelInfo =
+        reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+
+    unsigned int CudaThreadsPerBlock;
+    if (ThreadLimit > 0) {
+      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+      CudaThreadsPerBlock = ThreadLimit;
+      // Add master warp if necessary
+      if (KernelInfo->ExecutionMode == GENERIC) {
+        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
       }
+    } else {
+      DP("Setting CUDA threads per block to default %d\n",
+         DeviceData[DeviceId].NumThreads);
+      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+    }
 
-      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
-      if (err != CUDA_SUCCESS) {
-        DP("Error when copying data from device to host. Pointers: "
-           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
-           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
-        CUDA_ERR_STRING(err);
-        return NULL;
-      }
+    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Threads per block capped at device limit %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+    }
+
+    int KernelLimit;
+    Err = cuFuncGetAttribute(&KernelLimit,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             KernelInfo->Func);
+    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
+      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
+      CudaThreadsPerBlock = KernelLimit;
+    }
 
-      if (ExecModeVal < 0 || ExecModeVal > 1) {
-        DP("Error wrong exec_mode value specified in cubin file: %d\n",
-           ExecModeVal);
-        return NULL;
+    unsigned int CudaBlocksPerGrid;
+    if (TeamNum <= 0) {
+      if (LoopTripCount > 0 && EnvNumTeams < 0) {
+        if (KernelInfo->ExecutionMode == SPMD) {
+          // We have a combined construct, i.e. `target teams distribute
+          // parallel for [simd]`. We launch so many teams so that each thread
+          // will execute one iteration of the loop. round up to the nearest
+          // integer
+          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+        } else {
+          // If we reach this point, then we have a non-combined construct, i.e.
+          // `teams distribute` with a nested `parallel for` and each team is
+          // assigned one iteration of the `distribute` loop. E.g.:
+          //
+          // #pragma omp target teams distribute
+          // for(...loop_tripcount...) {
+          //   #pragma omp parallel for
+          //   for(...) {}
+          // }
+          //
+          // Threads within a team will execute the iterations of the `parallel`
+          // loop.
+          CudaBlocksPerGrid = LoopTripCount;
+        }
+        DP("Using %d teams due to loop trip count %" PRIu64
+           " and number of threads per block %d\n",
+           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else {
+        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
+        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
       }
+    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Capping number of teams to team limit %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
     } else {
-      DP("Loading global exec_mode '%s' - symbol missing, using default value "
-          "GENERIC (1)\n", ExecModeName);
-      CUDA_ERR_STRING(err);
+      DP("Using requested number of teams %d\n", TeamNum);
+      CudaBlocksPerGrid = TeamNum;
     }
 
-    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+    // Run on the device.
+    DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
+       CudaThreadsPerBlock);
 
-    __tgt_offload_entry entry = *e;
-    entry.addr = (void *)&KernelsList.back();
-    DeviceInfo.addOffloadEntry(device_id, entry);
-  }
+    CUstream Stream = getStream(DeviceId, AsyncInfo);
+    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
+                         /* gridDimZ */ 1, CudaThreadsPerBlock,
+                         /* blockDimY */ 1, /* blockDimZ */ 1,
+                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
+      return OFFLOAD_FAIL;
 
-  // send device environment data to the device
-  {
-    omptarget_device_environmentTy device_env;
+    DP("Launch of entry point at " DPxMOD " successful!\n",
+       DPxPTR(TgtEntryPtr));
 
-    device_env.debug_level = 0;
+    return OFFLOAD_SUCCESS;
+  }
 
-#ifdef OMPTARGET_DEBUG
-    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
-      device_env.debug_level = std::stoi(envStr);
+  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
+    CUresult Err = cuStreamSynchronize(Stream);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error when synchronizing stream. stream = " DPxMOD
+         ", async info ptr = " DPxMOD "\n",
+         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
+      CUDA_ERR_STRING(Err);
+      return OFFLOAD_FAIL;
     }
-#endif
 
-    const char * device_env_Name="omptarget_device_environment";
-    CUdeviceptr device_env_Ptr;
-    size_t cusize;
+    // Once the stream is synchronized, return it to stream pool and reset
+    // async_info. This is to make sure the synchronization only works for its
+    // own tasks.
+    StreamManager->returnStream(
+        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
+    AsyncInfoPtr->Queue = nullptr;
 
-    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+    return OFFLOAD_SUCCESS;
+  }
+};
 
-    if (err == CUDA_SUCCESS) {
-      if ((size_t)cusize != sizeof(device_env)) {
-        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
-            device_env_Name, cusize, sizeof(int32_t));
-        CUDA_ERR_STRING(err);
-        return NULL;
-      }
+DeviceRTLTy DeviceRTL;
+} // namespace
 
-      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
-      if (err != CUDA_SUCCESS) {
-        DP("Error when copying data from host to device. Pointers: "
-            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
-            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
-        CUDA_ERR_STRING(err);
-        return NULL;
-      }
+// Exposed library API function
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
-    } else {
-      DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
-      DP("Continue, considering this is a device RTL which does not accept environment setting.\n");
-    }
-  }
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+  return elf_check_machine(image, /* EM_CUDA */ 190);
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
 
-  return DeviceInfo.getOffloadEntriesTable(device_id);
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  DP("Init requires flags to %ld\n", RequiresFlags);
+  DeviceRTL.setRequiresFlag(RequiresFlags);
+  return RequiresFlags;
 }
 
-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
-  if (size == 0) {
-    return NULL;
-  }
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error while trying to set CUDA current context\n");
-    CUDA_ERR_STRING(err);
-    return NULL;
-  }
+  return DeviceRTL.initDevice(device_id);
+}
 
-  CUdeviceptr ptr;
-  err = cuMemAlloc(&ptr, size);
-  if (err != CUDA_SUCCESS) {
-    DP("Error while trying to allocate %d\n", err);
-    CUDA_ERR_STRING(err);
-    return NULL;
-  }
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.loadBinary(device_id, image);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  void *vptr = (void *)ptr;
-  return vptr;
+  return DeviceRTL.dataAlloc(device_id, size);
 }
 
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                               int64_t size) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
   __tgt_async_info async_info;
-  int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size,
-                                           &async_info);
+  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
+                                                 size, &async_info);
   if (rc != OFFLOAD_SUCCESS)
     return OFFLOAD_FAIL;
 
@@ -856,15 +935,20 @@ int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                     void *hst_ptr, int64_t size,
                                     __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
   assert(async_info_ptr && "async_info_ptr is nullptr");
-  return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
+
+  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
+                              async_info_ptr);
 }
 
 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                 int64_t size) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
   __tgt_async_info async_info;
-  int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size,
-                                             &async_info);
+  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
+                                                   size, &async_info);
   if (rc != OFFLOAD_SUCCESS)
     return OFFLOAD_FAIL;
 
@@ -874,26 +958,17 @@ int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                       void *tgt_ptr, int64_t size,
                                       __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
   assert(async_info_ptr && "async_info_ptr is nullptr");
-  return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
+
+  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
+                                async_info_ptr);
 }
 
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  err = cuMemFree((CUdeviceptr)tgt_ptr);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when freeing CUDA memory\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
+  return DeviceRTL.dataDelete(device_id, tgt_ptr);
 }
 
 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
@@ -902,8 +977,10 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                          int32_t arg_num, int32_t team_num,
                                          int32_t thread_limit,
                                          uint64_t loop_tripcount) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
   __tgt_async_info async_info;
-  int32_t rc = __tgt_rtl_run_target_team_region_async(
+  const int32_t rc = __tgt_rtl_run_target_team_region_async(
       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
       thread_limit, loop_tripcount, &async_info);
   if (rc != OFFLOAD_SUCCESS)
@@ -916,123 +993,21 @@ int32_t __tgt_rtl_run_target_team_region_async(
     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
     ptr
diff _t *tgt_offsets, int32_t arg_num, int32_t team_num,
     int32_t thread_limit, uint64_t loop_tripcount,
-    __tgt_async_info *async_info) {
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-
-  // All args are references.
-  std::vector<void *> args(arg_num);
-  std::vector<void *> ptrs(arg_num);
-
-  for (int32_t i = 0; i < arg_num; ++i) {
-    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
-    args[i] = &ptrs[i];
-  }
+    __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
-  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
-
-  int cudaThreadsPerBlock;
-
-  if (thread_limit > 0) {
-    cudaThreadsPerBlock = thread_limit;
-    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
-    // Add master warp if necessary
-    if (KernelInfo->ExecutionMode == GENERIC) {
-      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
-      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
-    }
-  } else {
-    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
-    DP("Setting CUDA threads per block to default %d\n",
-        DeviceInfo.NumThreads[device_id]);
-  }
-
-  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
-    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
-    DP("Threads per block capped at device limit %d\n",
-        DeviceInfo.ThreadsPerBlock[device_id]);
-  }
-
-  int kernel_limit;
-  err = cuFuncGetAttribute(&kernel_limit,
-      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
-  if (err == CUDA_SUCCESS) {
-    if (kernel_limit < cudaThreadsPerBlock) {
-      cudaThreadsPerBlock = kernel_limit;
-      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
-    }
-  }
-
-  int cudaBlocksPerGrid;
-  if (team_num <= 0) {
-    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
-      if (KernelInfo->ExecutionMode == SPMD) {
-        // We have a combined construct, i.e. `target teams distribute parallel
-        // for [simd]`. We launch so many teams so that each thread will
-        // execute one iteration of the loop.
-        // round up to the nearest integer
-        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
-      } else {
-        // If we reach this point, then we have a non-combined construct, i.e.
-        // `teams distribute` with a nested `parallel for` and each team is
-        // assigned one iteration of the `distribute` loop. E.g.:
-        //
-        // #pragma omp target teams distribute
-        // for(...loop_tripcount...) {
-        //   #pragma omp parallel for
-        //   for(...) {}
-        // }
-        //
-        // Threads within a team will execute the iterations of the `parallel`
-        // loop.
-        cudaBlocksPerGrid = loop_tripcount;
-      }
-      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
-          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
-          cudaThreadsPerBlock);
-    } else {
-      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
-      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
-    }
-  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
-    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
-    DP("Capping number of teams to team limit %d\n",
-        DeviceInfo.BlocksPerGrid[device_id]);
-  } else {
-    cudaBlocksPerGrid = team_num;
-    DP("Using requested number of teams %d\n", team_num);
-  }
-
-  // Run on the device.
-  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
-     cudaThreadsPerBlock);
-
-  CUstream Stream = getStream(device_id, async_info);
-  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
-                       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
-                       Stream, &args[0], 0);
-  if (err != CUDA_SUCCESS) {
-    DP("Device kernel launch failed!\n");
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-
-  DP("Launch of entry point at " DPxMOD " successful!\n",
-     DPxPTR(tgt_entry_ptr));
-
-  return OFFLOAD_SUCCESS;
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, async_info_ptr);
 }
 
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                     int32_t arg_num) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
   __tgt_async_info async_info;
-  int32_t rc = __tgt_rtl_run_target_region_async(
+  const int32_t rc = __tgt_rtl_run_target_region_async(
       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
   if (rc != OFFLOAD_SUCCESS)
     return OFFLOAD_FAIL;
@@ -1044,35 +1019,22 @@ int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                           void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                           int32_t arg_num,
-                                          __tgt_async_info *async_info) {
-  // use one team and the default number of threads.
-  const int32_t team_num = 1;
-  const int32_t thread_limit = 0;
+                                          __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
   return __tgt_rtl_run_target_team_region_async(
-      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
-      thread_limit, 0, async_info);
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
+      async_info_ptr);
 }
 
-int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
-  assert(async_info && "async_info is nullptr");
-  assert(async_info->Queue && "async_info->Queue is nullptr");
-
-  CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue);
-  CUresult Err = cuStreamSynchronize(Stream);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when synchronizing stream. stream = " DPxMOD
-       ", async info ptr = " DPxMOD "\n",
-       DPxPTR(Stream), DPxPTR(async_info));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  // Once the stream is synchronized, return it to stream pool and reset
-  // async_info. This is to make sure the synchronization only works for its own
-  // tasks.
-  DeviceInfo.returnStream(device_id, async_info);
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
 
-  return OFFLOAD_SUCCESS;
+  return DeviceRTL.synchronize(device_id, async_info_ptr);
 }
 
 #ifdef __cplusplus


        

