[Openmp-commits] [openmp] 4031bb9 - [OpenMP] Refined CUDA plugin to put all CUDA operations into class
Shilei Tian via Openmp-commits
openmp-commits at lists.llvm.org
Mon Apr 13 10:32:52 PDT 2020
Author: Shilei Tian
Date: 2020-04-13T13:32:46-04:00
New Revision: 4031bb982b7a9bf8603851516ad72374ccc09a6f
URL: https://github.com/llvm/llvm-project/commit/4031bb982b7a9bf8603851516ad72374ccc09a6f
DIFF: https://github.com/llvm/llvm-project/commit/4031bb982b7a9bf8603851516ad72374ccc09a6f.diff
LOG: [OpenMP] Refined CUDA plugin to put all CUDA operations into class
Summary: The current implementation mixes everything together, leaving almost no encapsulation. In this patch, all CUDA-related operations are moved into a new class, DeviceRTLTy, and only the necessary functions are exposed. In addition, all C++ code now conforms to the LLVM coding standards, while the exposed API functions keep their C style.
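To make the shape of the change concrete, here is a minimal sketch of the pattern (illustrative only: apart from DeviceRTLTy, DeviceDataTy, DeviceRTL and the __tgt_rtl_* entry point, everything below is simplified):

// All CUDA state and driver calls live in one class.
class DeviceRTLTy {
  std::vector<DeviceDataTy> DeviceData; // per-device state
public:
  int initDevice(const int DeviceId);   // wraps cuDeviceGet/cuCtxCreate/...
};

// A single file-scope instance in an anonymous namespace.
DeviceRTLTy DeviceRTL;

// Only thin C-style API functions are exposed.
extern "C" int32_t __tgt_rtl_init_device(int32_t device_id) {
  return DeviceRTL.initDevice(device_id);
}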
Reviewers: jdoerfert
Reviewed By: jdoerfert
Subscribers: jfb, yaxunl, guansong, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D77951
Added:
Modified:
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 4c38ee8c2d28..4ad58e290252 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -10,7 +10,6 @@
//
//===----------------------------------------------------------------------===//
-#include <atomic>
#include <cassert>
#include <cstddef>
#include <cuda.h>
@@ -62,7 +61,7 @@ struct FuncOrGblEntryTy {
enum ExecutionModeType {
SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
+ // combined constructs (`teams distribute parallel for [simd]`)
GENERIC, // everything else
NONE
};
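For reference, the two modes map to source constructs roughly as follows (an illustrative sketch; the function, bounds, and arrays are placeholders, and data-mapping clauses are omitted):

void example(int N, int M, float *A, float *B, float *C) {
  // SPMD: a combined construct; every CUDA thread executes loop iterations.
  #pragma omp target teams distribute parallel for
  for (int i = 0; i < N; ++i)
    A[i] = B[i] + C[i];

  // GENERIC: everything else, e.g. a nested parallel region; an extra
  // master warp is added to coordinate the worker threads.
  #pragma omp target teams distribute
  for (int i = 0; i < N; ++i) {
    #pragma omp parallel for
    for (int j = 0; j < M; ++j)
      A[i] += B[j];
  }
}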
@@ -100,18 +99,32 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
CUDA_ERR_STRING(Err);
return false;
}
-} // namespace
+
+// Structure containing per-device data
+struct DeviceDataTy {
+ std::list<FuncOrGblEntryTy> FuncGblEntries;
+ CUcontext Context = nullptr;
+ // Device properties
+ int ThreadsPerBlock = 0;
+ int BlocksPerGrid = 0;
+ int WarpSize = 0;
+ // OpenMP properties
+ int NumTeams = 0;
+ int NumThreads = 0;
+};
class StreamManagerTy {
int NumberOfDevices;
+ // The initial size of stream pool
+ int EnvNumInitialStreams;
// Per-device stream mutex
std::vector<std::unique_ptr<std::mutex>> StreamMtx;
// Per-device stream Id indicates the next available stream in the pool
std::vector<int> NextStreamId;
// Per-device stream pool
std::vector<std::vector<CUstream>> StreamPool;
- // Pointer to per-device context
- std::vector<CUcontext> &ContextsPtr;
+ // Reference to per-device data
+ std::vector<DeviceDataTy> &DeviceData;
 // If there is no CUstream left in the pool, we will resize the pool to
 // allocate more CUstreams. This function should be called with the device mutex,
@@ -121,10 +134,8 @@ class StreamManagerTy {
const size_t CurrentSize = Pool.size();
assert(NewSize > CurrentSize && "new size is not larger than current size");
- Pool.resize(NewSize, nullptr);
-
- CUresult err = cuCtxSetCurrent(ContextsPtr[DeviceId]);
- if (!checkResult(err, "Error when setting current CUDA context\n")) {
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
 // We will return if we cannot switch to the right context, to avoid
 // creating a bunch of streams that do not correspond to the right
 // device. The offloading will fail later because the selected CUstream is
@@ -132,29 +143,25 @@ class StreamManagerTy {
return;
}
+ Pool.resize(NewSize, nullptr);
+
for (size_t I = CurrentSize; I < NewSize; ++I) {
- err = cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING);
- checkResult(err,
- "Error when creating CUDA stream to resize stream pool\n");
+ checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
+ "Error returned from cuStreamCreate\n");
}
}
public:
- StreamManagerTy(const int NumberOfDevices, std::vector<CUcontext> &CtxPtr)
- : NumberOfDevices(NumberOfDevices), ContextsPtr(CtxPtr) {
+ StreamManagerTy(const int NumberOfDevices,
+ std::vector<DeviceDataTy> &DeviceData)
+ : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
+ DeviceData(DeviceData) {
StreamPool.resize(NumberOfDevices);
NextStreamId.resize(NumberOfDevices);
StreamMtx.resize(NumberOfDevices);
- // Initially let's create 32 streams for each device
- int EnvNumInitialStreams = 32;
- char *envStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS");
- if (envStr)
- EnvNumInitialStreams = std::stoi(envStr);
-
- // Initialize the stream pool for each device
- for (std::vector<CUstream> &S : StreamPool)
- S.resize(EnvNumInitialStreams);
+ if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
+ EnvNumInitialStreams = std::stoi(EnvStr);
// Initialize the next stream id
std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
@@ -167,14 +174,13 @@ class StreamManagerTy {
~StreamManagerTy() {
// Destroy streams
for (int I = 0; I < NumberOfDevices; ++I) {
- CUresult err = cuCtxSetCurrent(ContextsPtr[I]);
- checkResult(err, "Error when setting current CUDA context\n");
+ checkResult(cuCtxSetCurrent(DeviceData[I].Context),
+ "Error returned from cuCtxSetCurrent\n");
for (CUstream &S : StreamPool[I]) {
- if (!S)
- continue;
- err = cuStreamDestroy(S);
- checkResult(err, "Error when destroying CUDA stream\n");
+ if (S)
+ checkResult(cuStreamDestroy(S),
+ "Error returned from cuStreamDestroy\n");
}
}
}
@@ -192,10 +198,6 @@ class StreamManagerTy {
// ^
// id
CUstream getStream(const int DeviceId) {
- assert(DeviceId >= 0 &&
- static_cast<size_t>(DeviceId) < NextStreamId.size() &&
- "Unexpected device id");
-
const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];
 // No CUstream left in the pool; we need to request more from the CUDA RT
@@ -221,632 +223,709 @@ class StreamManagerTy {
 // Therefore, after several executions, the order of the pool might be
 // different from its initial state.
void returnStream(const int DeviceId, CUstream Stream) {
- assert(DeviceId >= 0 &&
- static_cast<size_t>(DeviceId) < NextStreamId.size() &&
- "Unexpected device id");
-
const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];
assert(Id > 0 && "Wrong stream ID");
StreamPool[DeviceId][--Id] = Stream;
}
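The pool thus behaves like a stack with NextStreamId as the stack pointer: getStream hands out StreamPool[Id] and increments Id, while returnStream stores the stream back at --Id. A runnable toy model of just this discipline (integers stand in for CUstream; this is not plugin code):

#include <cassert>
#include <vector>

struct PoolModel {
  std::vector<int> Pool{10, 11, 12};
  int NextId = 0;
  int get() { return Pool[NextId++]; }    // hand out the next stream
  void put(int S) { Pool[--NextId] = S; } // return it on top of the stack
};

int main() {
  PoolModel P;
  int A = P.get(), B = P.get();               // A = 10, B = 11
  P.put(A);                                   // slot 1 now holds 10
  P.put(B);                                   // slot 0 now holds 11
  assert(P.Pool[0] == 11 && P.Pool[1] == 10); // order differs from initial
  return 0;
}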
- void initializeDevice(int DeviceId) {
- // This function should be called after setting right context
- for (CUstream &Stream : StreamPool[DeviceId]) {
- CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
- checkResult(Err, "Error when creating CUDA stream\n");
- }
- }
-};
+ bool initializeDeviceStreamPool(const int DeviceId) {
+ assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
- std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
- std::shared_ptr<StreamManagerTy> StreamManager;
+ resizeStreamPool(DeviceId, EnvNumInitialStreams);
-public:
- int NumberOfDevices;
- std::vector<CUmodule> Modules;
- std::vector<CUcontext> Contexts;
+ // Check the size of stream pool
+ if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
+ return false;
- // Device properties
- std::vector<int> ThreadsPerBlock;
- std::vector<int> BlocksPerGrid;
- std::vector<int> WarpSize;
+ // Check whether each stream is valid
+ for (CUstream &S : StreamPool[DeviceId])
+ if (!S)
+ return false;
- // OpenMP properties
- std::vector<int> NumTeams;
- std::vector<int> NumThreads;
+ return true;
+ }
+};
- // OpenMP Environment properties
+class DeviceRTLTy {
+ int NumberOfDevices;
+ // OpenMP environment properties
int EnvNumTeams;
int EnvTeamLimit;
-
- // OpenMP Requires Flags
+ // OpenMP requires flags
int64_t RequiresFlags;
- // static int EnvNumThreads;
- static const int HardTeamLimit = 1 << 16; // 64k
- static const int HardThreadLimit = 1024;
- static const int DefaultNumTeams = 128;
- static const int DefaultNumThreads = 128;
-
- std::shared_ptr<StreamManagerTy> getStreamManager() { return StreamManager; }
-
- CUstream getStream(const int DeviceId) {
- return StreamManager->getStream(DeviceId);
- }
+ static constexpr const int HardTeamLimit = 1U << 16U; // 64k
+ static constexpr const int HardThreadLimit = 1024;
+ static constexpr const int DefaultNumTeams = 128;
+ static constexpr const int DefaultNumThreads = 128;
- void returnStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) {
- assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
- assert(AsyncInfoPtr->Queue && "AsyncInfoPtr->Queue is nullptr");
-
- StreamManager->returnStream(
- DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
- AsyncInfoPtr->Queue = nullptr;
- }
+ std::unique_ptr<StreamManagerTy> StreamManager;
+ std::vector<DeviceDataTy> DeviceData;
+ std::vector<CUmodule> Modules;
// Record entry point associated with device
- void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
- assert(device_id < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
+ void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
+ FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
E.Entries.push_back(entry);
}
// Return true if the entry is associated with device
- bool findOffloadEntry(int32_t device_id, void *addr) {
- assert(device_id < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
- for (auto &it : E.Entries) {
- if (it.addr == addr)
+ bool findOffloadEntry(const int DeviceId, const void *Addr) const {
+ for (const __tgt_offload_entry &Itr :
+ DeviceData[DeviceId].FuncGblEntries.back().Entries)
+ if (Itr.addr == Addr)
return true;
- }
return false;
}
// Return the pointer to the target entries table
- __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
- assert(device_id < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
-
- int32_t size = E.Entries.size();
+ __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
+ FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
- // Table is empty
- if (!size)
- return 0;
-
- __tgt_offload_entry *begin = &E.Entries[0];
- __tgt_offload_entry *end = &E.Entries[size - 1];
+ if (E.Entries.empty())
+ return nullptr;
// Update table info according to the entries and return the pointer
- E.Table.EntriesBegin = begin;
- E.Table.EntriesEnd = ++end;
+ E.Table.EntriesBegin = E.Entries.data();
+ E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
return &E.Table;
}
// Clear entries table for a device
- void clearOffloadEntriesTable(int32_t device_id) {
- assert(device_id < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncGblEntries[device_id].emplace_back();
- FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+ void clearOffloadEntriesTable(const int DeviceId) {
+ DeviceData[DeviceId].FuncGblEntries.emplace_back();
+ FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
E.Entries.clear();
- E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+ E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
+ }
+
+ CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+
+ if (!AsyncInfoPtr->Queue)
+ AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
+
+ return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
}
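Together with synchronize() below, this binds one stream to each __tgt_async_info lazily: the first call pops a stream from the pool and caches it in Queue, later calls reuse it, and synchronize() waits on it and returns it. A condensed view of that lifecycle (illustrative; in the real code getStream is private and error handling applies):

__tgt_async_info Info;                    // Queue starts as nullptr
CUstream S1 = getStream(DeviceId, &Info); // pops a stream, caches it in Queue
CUstream S2 = getStream(DeviceId, &Info); // S2 == S1: the cached stream
// ... enqueue asynchronous work on S1 ...
// synchronize(DeviceId, &Info) waits, returns the stream, resets Queue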
- RTLDeviceInfoTy() {
+public:
+ // This class should not be copied
+ DeviceRTLTy(const DeviceRTLTy &) = delete;
+ DeviceRTLTy(DeviceRTLTy &&) = delete;
+
+ DeviceRTLTy()
+ : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
+ RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
- if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
- DebugLevel = std::stoi(envStr);
- }
+ if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
+ DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG
DP("Start initializing CUDA\n");
- CUresult err = cuInit(0);
- if (err != CUDA_SUCCESS) {
- DP("Error when initializing CUDA\n");
- CUDA_ERR_STRING(err);
+ CUresult Err = cuInit(0);
+ if (!checkResult(Err, "Error returned from cuInit\n")) {
return;
}
- NumberOfDevices = 0;
-
- err = cuDeviceGetCount(&NumberOfDevices);
- if (err != CUDA_SUCCESS) {
- DP("Error when getting CUDA device count\n");
- CUDA_ERR_STRING(err);
+ Err = cuDeviceGetCount(&NumberOfDevices);
+ if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
return;
- }
if (NumberOfDevices == 0) {
DP("There are no devices supporting CUDA.\n");
return;
}
- FuncGblEntries.resize(NumberOfDevices);
- Contexts.resize(NumberOfDevices);
- ThreadsPerBlock.resize(NumberOfDevices);
- BlocksPerGrid.resize(NumberOfDevices);
- WarpSize.resize(NumberOfDevices);
- NumTeams.resize(NumberOfDevices);
- NumThreads.resize(NumberOfDevices);
+ DeviceData.resize(NumberOfDevices);
// Get environment variables regarding teams
- char *envStr = getenv("OMP_TEAM_LIMIT");
- if (envStr) {
+ if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
// OMP_TEAM_LIMIT has been set
- EnvTeamLimit = std::stoi(envStr);
+ EnvTeamLimit = std::stoi(EnvStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
- } else {
- EnvTeamLimit = -1;
}
- envStr = getenv("OMP_NUM_TEAMS");
- if (envStr) {
+ if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
// OMP_NUM_TEAMS has been set
- EnvNumTeams = std::stoi(envStr);
+ EnvNumTeams = std::stoi(EnvStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
- } else {
- EnvNumTeams = -1;
}
StreamManager =
- std::make_shared<StreamManagerTy>(NumberOfDevices, Contexts);
-
- // Default state.
- RequiresFlags = OMP_REQ_UNDEFINED;
+ std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
}
- ~RTLDeviceInfoTy() {
+ ~DeviceRTLTy() {
 // First destruct the stream manager, in case Contexts is destructed before it
StreamManager = nullptr;
- // Close modules
- for (auto &module : Modules)
- if (module) {
- CUresult err = cuModuleUnload(module);
- if (err != CUDA_SUCCESS) {
- DP("Error when unloading CUDA module\n");
- CUDA_ERR_STRING(err);
- }
- }
+ for (CUmodule &M : Modules)
+ // Close module
+ if (M)
+ checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
- // Destroy contexts
- for (auto &ctx : Contexts)
- if (ctx) {
- CUresult err = cuCtxDestroy(ctx);
- if (err != CUDA_SUCCESS) {
- DP("Error when destroying CUDA context\n");
- CUDA_ERR_STRING(err);
- }
- }
+ for (DeviceDataTy &D : DeviceData) {
+ // Destroy context
+ if (D.Context)
+ checkResult(cuCtxDestroy(D.Context),
+ "Error returned from cuCtxDestroy\n");
+ }
}
-};
-
-static RTLDeviceInfoTy DeviceInfo;
-namespace {
-CUstream getStream(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr) {
- assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+ // Check whether a given DeviceId is valid
+ bool isValidDeviceId(const int DeviceId) const {
+ return DeviceId >= 0 && DeviceId < NumberOfDevices;
+ }
- if (!AsyncInfoPtr->Queue)
- AsyncInfoPtr->Queue = DeviceInfo.getStream(DeviceId);
+  int getNumOfDevices() const { return NumberOfDevices; }
+
+ void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
+
+ int initDevice(const int DeviceId) {
+ CUdevice Device;
+
+ DP("Getting device %d\n", DeviceId);
+ CUresult Err = cuDeviceGet(&Device, DeviceId);
+ if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
+ return OFFLOAD_FAIL;
+
+ // Create the context and save it to use whenever this device is selected.
+ Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
+ Device);
+ if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
+ return OFFLOAD_FAIL;
+
+ Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return OFFLOAD_FAIL;
+
+ // Initialize stream pool
+ if (!StreamManager->initializeDeviceStreamPool(DeviceId))
+ return OFFLOAD_FAIL;
+
+ // Query attributes to determine number of threads/block and blocks/grid.
+ int MaxGridDimX;
+ Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+ Device);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error getting max grid dimension, use default value %d\n",
+ DeviceRTLTy::DefaultNumTeams);
+ DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
+ } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
+ DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
+ DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
+ } else {
+ DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n",
+ MaxGridDimX, DeviceRTLTy::HardTeamLimit);
+ DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
+ }
- return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
-}
+ // We are only exploiting threads along the x axis.
+ int MaxBlockDimX;
+ Err = cuDeviceGetAttribute(&MaxBlockDimX,
+ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error getting max block dimension, use default value %d\n",
+ DeviceRTLTy::DefaultNumThreads);
+ DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
+ } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+ DP("Using %d CUDA threads per block\n", MaxBlockDimX);
+ DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
+ } else {
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+ "capping at the hard limit\n",
+ MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
+ DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+ }
-int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
- __tgt_async_info *AsyncInfoPtr) {
- assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ // Get and set warp size
+ int WarpSize;
+ Err =
+ cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error getting warp size, assume default value 32\n");
+ DeviceData[DeviceId].WarpSize = 32;
+ } else {
+ DP("Using warp size %d\n", WarpSize);
+ DeviceData[DeviceId].WarpSize = WarpSize;
+ }
- CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
+ // Adjust teams to the env variables
+ if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
+ DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+ EnvTeamLimit);
+ DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
+ }
- err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from device to host. Pointers: host = " DPxMOD
- ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+ DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
+ DeviceData[DeviceId].WarpSize);
- return OFFLOAD_SUCCESS;
-}
+ // Set default number of teams
+ if (EnvNumTeams > 0) {
+ DP("Default number of teams set according to environment %d\n",
+ EnvNumTeams);
+ DeviceData[DeviceId].NumTeams = EnvNumTeams;
+ } else {
+ DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
+ DP("Default number of teams set according to library's default %d\n",
+ DeviceRTLTy::DefaultNumTeams);
+ }
-int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
- __tgt_async_info *AsyncInfoPtr) {
- assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
+ DP("Default number of teams exceeds device limit, capping at %d\n",
+ DeviceData[DeviceId].BlocksPerGrid);
+ DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
+ }
- CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
+ // Set default number of threads
+ DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
+ DP("Default number of threads set according to library's default %d\n",
+ DeviceRTLTy::DefaultNumThreads);
+ if (DeviceData[DeviceId].NumThreads >
+ DeviceData[DeviceId].ThreadsPerBlock) {
+ DP("Default number of threads exceeds device limit, capping at %d\n",
+ DeviceData[DeviceId].ThreadsPerBlock);
+      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
+ }
- err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from host to device. Pointers: host = " DPxMOD
- ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
+ return OFFLOAD_SUCCESS;
}
- return OFFLOAD_SUCCESS;
-}
-} // namespace
+ __tgt_target_table *loadBinary(const int DeviceId,
+ const __tgt_device_image *Image) {
+ // Set the context we are using
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return nullptr;
+
+ // Clear the offload table as we are going to create a new one.
+ clearOffloadEntriesTable(DeviceId);
+
+ // Create the module and extract the function pointers.
+ CUmodule Module;
+ DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
+ Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
+ if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
+ return nullptr;
+
+ DP("CUDA module successfully loaded!\n");
+
+ Modules.push_back(Module);
+
+ // Find the symbols in the module by name.
+ const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
+ const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
+
+ for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
+ if (!E->addr) {
+        // We return nullptr when something like this happens; the host should
+        // always have something in the address to uniquely identify the
+        // target region.
+ DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
+ return nullptr;
+ }
-#ifdef __cplusplus
-extern "C" {
-#endif
+ if (E->size) {
+ __tgt_offload_entry Entry = *E;
+ CUdeviceptr CUPtr;
+ size_t CUSize;
+ Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
+ // We keep this style here because we need the name
+ if (Err != CUDA_SUCCESS) {
+ DP("Loading global '%s' (Failed)\n", E->name);
+ CUDA_ERR_STRING(Err);
+ return nullptr;
+ }
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
- return elf_check_machine(image, 190); // EM_CUDA = 190.
-}
+ if (CUSize != E->size) {
+ DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
+ CUSize, E->size);
+ return nullptr;
+ }
-int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+ DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
+
+ Entry.addr = (void *)(CUPtr);
+
+ // Note: In the current implementation declare target variables
+ // can either be link or to. This means that once unified
+ // memory is activated via the requires directive, the variable
+ // can be used directly from the host in both cases.
+ // TODO: when variables types other than to or link are added,
+ // the below condition should be changed to explicitly
+ // check for to and link variables types:
+ // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
+ // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
+ if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+ // If unified memory is present any target link or to variables
+ // can access host addresses directly. There is no longer a
+ // need for device copies.
+ cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
+ DP("Copy linked variable host address (" DPxMOD
+ ") to device address (" DPxMOD ")\n",
+ DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
+ }
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
- DP("Init requires flags to %ld\n", RequiresFlags);
- DeviceInfo.RequiresFlags = RequiresFlags;
- return RequiresFlags;
-}
+ addOffloadEntry(DeviceId, Entry);
-int32_t __tgt_rtl_init_device(int32_t device_id) {
+ continue;
+ }
- CUdevice cuDevice;
- DP("Getting device %d\n", device_id);
- CUresult err = cuDeviceGet(&cuDevice, device_id);
- if (err != CUDA_SUCCESS) {
- DP("Error when getting CUDA device with id = %d\n", device_id);
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ CUfunction Func;
+ Err = cuModuleGetFunction(&Func, Module, E->name);
+ // We keep this style here because we need the name
+ if (Err != CUDA_SUCCESS) {
+ DP("Loading '%s' (Failed)\n", E->name);
+ CUDA_ERR_STRING(Err);
+ return nullptr;
+ }
- // Create the context and save it to use whenever this device is selected.
- err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
- cuDevice);
- if (err != CUDA_SUCCESS) {
- DP("Error when creating a CUDA context\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+ DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
+
+ // default value GENERIC (in case symbol is missing from cubin file)
+ int8_t ExecModeVal = ExecutionModeType::GENERIC;
+ std::string ExecModeNameStr(E->name);
+ ExecModeNameStr += "_exec_mode";
+ const char *ExecModeName = ExecModeNameStr.c_str();
+
+ CUdeviceptr ExecModePtr;
+ size_t CUSize;
+ Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
+ if (Err == CUDA_SUCCESS) {
+ if (CUSize != sizeof(int8_t)) {
+ DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+ ExecModeName, CUSize, sizeof(int8_t));
+ return nullptr;
+ }
- err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting current CUDA context\n");
- CUDA_ERR_STRING(err);
- }
+ Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+ DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
+ CUDA_ERR_STRING(Err);
+ return nullptr;
+ }
- // Initialize stream pool
- DeviceInfo.getStreamManager()->initializeDevice(device_id);
-
- // Query attributes to determine number of threads/block and blocks/grid.
- int maxGridDimX;
- err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
- cuDevice);
- if (err != CUDA_SUCCESS) {
- DP("Error getting max grid dimension, use default\n");
- DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
- } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
- DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
- DP("Using %d CUDA blocks per grid\n", maxGridDimX);
- } else {
- DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
- DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
- "at the hard limit\n",
- maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
- }
+ if (ExecModeVal < 0 || ExecModeVal > 1) {
+ DP("Error wrong exec_mode value specified in cubin file: %d\n",
+ ExecModeVal);
+ return nullptr;
+ }
+ } else {
+ DP("Loading global exec_mode '%s' - symbol missing, using default "
+ "value GENERIC (1)\n",
+ ExecModeName);
+ CUDA_ERR_STRING(Err);
+ }
- // We are only exploiting threads along the x axis.
- int maxBlockDimX;
- err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
- cuDevice);
- if (err != CUDA_SUCCESS) {
- DP("Error getting max block dimension, use default\n");
- DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
- } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
- DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
- DP("Using %d CUDA threads per block\n", maxBlockDimX);
- } else {
- DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
- DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
- "at the hard limit\n",
- maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
- }
+ KernelsList.emplace_back(Func, ExecModeVal);
- int warpSize;
- err =
- cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
- if (err != CUDA_SUCCESS) {
- DP("Error getting warp size, assume default\n");
- DeviceInfo.WarpSize[device_id] = 32;
- } else {
- DeviceInfo.WarpSize[device_id] = warpSize;
- }
+ __tgt_offload_entry Entry = *E;
+ Entry.addr = &KernelsList.back();
+ addOffloadEntry(DeviceId, Entry);
+ }
- // Adjust teams to the env variables
- if (DeviceInfo.EnvTeamLimit > 0 &&
- DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
- DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
- DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
- DeviceInfo.EnvTeamLimit);
- }
+ // send device environment data to the device
+ {
+ omptarget_device_environmentTy DeviceEnv{0};
- DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
- DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
- DeviceInfo.WarpSize[device_id]);
-
- // Set default number of teams
- if (DeviceInfo.EnvNumTeams > 0) {
- DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
- DP("Default number of teams set according to environment %d\n",
- DeviceInfo.EnvNumTeams);
- } else {
- DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
- DP("Default number of teams set according to library's default %d\n",
- RTLDeviceInfoTy::DefaultNumTeams);
- }
- if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
- DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
- DP("Default number of teams exceeds device limit, capping at %d\n",
- DeviceInfo.BlocksPerGrid[device_id]);
- }
+#ifdef OMPTARGET_DEBUG
+ if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
+ DeviceEnv.debug_level = std::stoi(EnvStr);
+#endif
- // Set default number of threads
- DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
- DP("Default number of threads set according to library's default %d\n",
- RTLDeviceInfoTy::DefaultNumThreads);
- if (DeviceInfo.NumThreads[device_id] >
- DeviceInfo.ThreadsPerBlock[device_id]) {
- DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
- DP("Default number of threads exceeds device limit, capping at %d\n",
- DeviceInfo.ThreadsPerBlock[device_id]);
- }
+ const char *DeviceEnvName = "omptarget_device_environment";
+ CUdeviceptr DeviceEnvPtr;
+ size_t CUSize;
+
+ Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
+ if (Err == CUDA_SUCCESS) {
+ if (CUSize != sizeof(DeviceEnv)) {
+ DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+ DeviceEnvName, CUSize, sizeof(int32_t));
+ CUDA_ERR_STRING(Err);
+ return nullptr;
+ }
- return OFFLOAD_SUCCESS;
-}
+ Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+ DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
+ CUDA_ERR_STRING(Err);
+ return nullptr;
+ }
-__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
- __tgt_device_image *image) {
-
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting a CUDA context for device %d\n", device_id);
- CUDA_ERR_STRING(err);
- return NULL;
+ DP("Sending global device environment data %zu bytes\n", CUSize);
+ } else {
+ DP("Finding global device environment '%s' - symbol missing.\n",
+ DeviceEnvName);
+ DP("Continue, considering this is a device RTL which does not accept "
+ "environment setting.\n");
+ }
+ }
+
+ return getOffloadEntriesTable(DeviceId);
}
- // Clear the offload table as we are going to create a new one.
- DeviceInfo.clearOffloadEntriesTable(device_id);
+ void *dataAlloc(const int DeviceId, const int64_t Size) const {
+ if (Size == 0)
+ return nullptr;
- // Create the module and extract the function pointers.
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return nullptr;
- CUmodule cumod;
- DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
- err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
- if (err != CUDA_SUCCESS) {
- DP("Error when loading CUDA module\n");
- CUDA_ERR_STRING(err);
- return NULL;
- }
+ CUdeviceptr DevicePtr;
+ Err = cuMemAlloc(&DevicePtr, Size);
+ if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+ return nullptr;
- DP("CUDA module successfully loaded!\n");
- DeviceInfo.Modules.push_back(cumod);
+ return (void *)DevicePtr;
+ }
- // Find the symbols in the module by name.
- __tgt_offload_entry *HostBegin = image->EntriesBegin;
- __tgt_offload_entry *HostEnd = image->EntriesEnd;
+ int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
+ const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
- for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return OFFLOAD_FAIL;
- if (!e->addr) {
- // We return NULL when something like this happens, the host should have
- // always something in the address to uniquely identify the target region.
- DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+ CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
- return NULL;
+ Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n",
+ DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+ CUDA_ERR_STRING(Err);
+ return OFFLOAD_FAIL;
}
- if (e->size) {
- __tgt_offload_entry entry = *e;
+ return OFFLOAD_SUCCESS;
+ }
+
+ int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
+ const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
- CUdeviceptr cuptr;
- size_t cusize;
- err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return OFFLOAD_FAIL;
- if (err != CUDA_SUCCESS) {
- DP("Loading global '%s' (Failed)\n", e->name);
- CUDA_ERR_STRING(err);
- return NULL;
- }
+ CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
- if (cusize != e->size) {
- DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
- cusize, e->size);
- CUDA_ERR_STRING(err);
- return NULL;
- }
+ Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n",
+ DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+ CUDA_ERR_STRING(Err);
+ return OFFLOAD_FAIL;
+ }
- DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
- DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
- entry.addr = (void *)cuptr;
-
- // Note: In the current implementation declare target variables
- // can either be link or to. This means that once unified
- // memory is activated via the requires directive, the variable
- // can be used directly from the host in both cases.
- // TODO: when variables types other than to or link are added,
- // the below condition should be changed to explicitly
- // check for to and link variables types:
- // (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
- // (e->flags & OMP_DECLARE_TARGET_LINK ||
- // e->flags == OMP_DECLARE_TARGET_TO))
- if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
- // If unified memory is present any target link or to variables
- // can access host addresses directly. There is no longer a
- // need for device copies.
- cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
- DP("Copy linked variable host address (" DPxMOD ")"
- "to device address (" DPxMOD ")\n",
- DPxPTR(*((void**)e->addr)), DPxPTR(cuptr));
- }
+ return OFFLOAD_SUCCESS;
+ }
- DeviceInfo.addOffloadEntry(device_id, entry);
+ int dataDelete(const int DeviceId, void *TgtPtr) const {
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return OFFLOAD_FAIL;
- continue;
- }
+ Err = cuMemFree((CUdeviceptr)TgtPtr);
+ if (!checkResult(Err, "Error returned from cuMemFree\n"))
+ return OFFLOAD_FAIL;
- CUfunction fun;
- err = cuModuleGetFunction(&fun, cumod, e->name);
+ return OFFLOAD_SUCCESS;
+ }
- if (err != CUDA_SUCCESS) {
- DP("Loading '%s' (Failed)\n", e->name);
- CUDA_ERR_STRING(err);
- return NULL;
+ int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
+                          void **TgtArgs, ptrdiff_t *TgtOffsets,
+ const int ArgNum, const int TeamNum,
+ const int ThreadLimit,
+ const unsigned int LoopTripCount,
+ __tgt_async_info *AsyncInfo) const {
+ CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+ if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+ return OFFLOAD_FAIL;
+
+ // All args are references.
+ std::vector<void *> Args(ArgNum);
+ std::vector<void *> Ptrs(ArgNum);
+
+ for (int I = 0; I < ArgNum; ++I) {
+ Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+ Args[I] = &Ptrs[I];
}
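The two vectors exist because cuLaunchKernel's kernelParams argument is an array of pointers, one per kernel parameter, each pointing at that parameter's value. A reduced sketch of the same convention (illustrative, three pointer-sized arguments):

void *Vals[3];   // the actual argument values (here, offset device pointers)
void *Params[3]; // what cuLaunchKernel receives
for (int I = 0; I < 3; ++I)
  Params[I] = &Vals[I]; // each entry points at one value slot
// cuLaunchKernel(Func, ..., Params, /* extra */ nullptr);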
- DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
- DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
-
- // default value GENERIC (in case symbol is missing from cubin file)
- int8_t ExecModeVal = ExecutionModeType::GENERIC;
- std::string ExecModeNameStr (e->name);
- ExecModeNameStr += "_exec_mode";
- const char *ExecModeName = ExecModeNameStr.c_str();
-
- CUdeviceptr ExecModePtr;
- size_t cusize;
- err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
- if (err == CUDA_SUCCESS) {
- if ((size_t)cusize != sizeof(int8_t)) {
- DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
- ExecModeName, cusize, sizeof(int8_t));
- CUDA_ERR_STRING(err);
- return NULL;
+ const KernelTy *KernelInfo =
+ reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+
+ unsigned int CudaThreadsPerBlock;
+ if (ThreadLimit > 0) {
+ DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+ CudaThreadsPerBlock = ThreadLimit;
+ // Add master warp if necessary
+ if (KernelInfo->ExecutionMode == GENERIC) {
+ DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
+ CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
}
+ } else {
+ DP("Setting CUDA threads per block to default %d\n",
+ DeviceData[DeviceId].NumThreads);
+ CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+ }
- err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from device to host. Pointers: "
- "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
- DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
- CUDA_ERR_STRING(err);
- return NULL;
- }
+ if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+ DP("Threads per block capped at device limit %d\n",
+ DeviceData[DeviceId].ThreadsPerBlock);
+ CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+ }
+
+ int KernelLimit;
+ Err = cuFuncGetAttribute(&KernelLimit,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+ KernelInfo->Func);
+ if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
+ DP("Threads per block capped at kernel limit %d\n", KernelLimit);
+ CudaThreadsPerBlock = KernelLimit;
+ }
- if (ExecModeVal < 0 || ExecModeVal > 1) {
- DP("Error wrong exec_mode value specified in cubin file: %d\n",
- ExecModeVal);
- return NULL;
+ unsigned int CudaBlocksPerGrid;
+ if (TeamNum <= 0) {
+ if (LoopTripCount > 0 && EnvNumTeams < 0) {
+ if (KernelInfo->ExecutionMode == SPMD) {
+ // We have a combined construct, i.e. `target teams distribute
+ // parallel for [simd]`. We launch so many teams so that each thread
+          // will execute one iteration of the loop. Round up to the nearest
+          // integer.
+ CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+ } else {
+ // If we reach this point, then we have a non-combined construct, i.e.
+ // `teams distribute` with a nested `parallel for` and each team is
+ // assigned one iteration of the `distribute` loop. E.g.:
+ //
+ // #pragma omp target teams distribute
+ // for(...loop_tripcount...) {
+ // #pragma omp parallel for
+ // for(...) {}
+ // }
+ //
+ // Threads within a team will execute the iterations of the `parallel`
+ // loop.
+ CudaBlocksPerGrid = LoopTripCount;
+ }
+ DP("Using %d teams due to loop trip count %" PRIu64
+ " and number of threads per block %d\n",
+ CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+ } else {
+ DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
+ CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
}
+ } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+ DP("Capping number of teams to team limit %d\n",
+ DeviceData[DeviceId].BlocksPerGrid);
+ CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
} else {
- DP("Loading global exec_mode '%s' - symbol missing, using default value "
- "GENERIC (1)\n", ExecModeName);
- CUDA_ERR_STRING(err);
+ DP("Using requested number of teams %d\n", TeamNum);
+ CudaBlocksPerGrid = TeamNum;
}
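As a worked example with illustrative numbers: in SPMD mode with LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the round-up division gives ((10000 - 1) / 128) + 1 = 79 blocks, so the 79 * 128 = 10112 launched threads cover all 10000 iterations.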
- KernelsList.push_back(KernelTy(fun, ExecModeVal));
+ // Run on the device.
+ DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
+ CudaThreadsPerBlock);
- __tgt_offload_entry entry = *e;
- entry.addr = (void *)&KernelsList.back();
- DeviceInfo.addOffloadEntry(device_id, entry);
- }
+ CUstream Stream = getStream(DeviceId, AsyncInfo);
+ Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
+ /* gridDimZ */ 1, CudaThreadsPerBlock,
+ /* blockDimY */ 1, /* blockDimZ */ 1,
+ /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+ if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
+ return OFFLOAD_FAIL;
- // send device environment data to the device
- {
- omptarget_device_environmentTy device_env;
+ DP("Launch of entry point at " DPxMOD " successful!\n",
+ DPxPTR(TgtEntryPtr));
- device_env.debug_level = 0;
+ return OFFLOAD_SUCCESS;
+ }
-#ifdef OMPTARGET_DEBUG
- if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
- device_env.debug_level = std::stoi(envStr);
+ int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
+ CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
+ CUresult Err = cuStreamSynchronize(Stream);
+ if (Err != CUDA_SUCCESS) {
+ DP("Error when synchronizing stream. stream = " DPxMOD
+ ", async info ptr = " DPxMOD "\n",
+ DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
+ CUDA_ERR_STRING(Err);
+ return OFFLOAD_FAIL;
}
-#endif
- const char * device_env_Name="omptarget_device_environment";
- CUdeviceptr device_env_Ptr;
- size_t cusize;
+ // Once the stream is synchronized, return it to stream pool and reset
+ // async_info. This is to make sure the synchronization only works for its
+ // own tasks.
+ StreamManager->returnStream(
+ DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
+ AsyncInfoPtr->Queue = nullptr;
- err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+ return OFFLOAD_SUCCESS;
+ }
+};
- if (err == CUDA_SUCCESS) {
- if ((size_t)cusize != sizeof(device_env)) {
- DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
- device_env_Name, cusize, sizeof(int32_t));
- CUDA_ERR_STRING(err);
- return NULL;
- }
+DeviceRTLTy DeviceRTL;
+} // namespace
- err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
- if (err != CUDA_SUCCESS) {
- DP("Error when copying data from host to device. Pointers: "
- "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
- DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
- CUDA_ERR_STRING(err);
- return NULL;
- }
+// Exposed library API function
+#ifdef __cplusplus
+extern "C" {
+#endif
- DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
- } else {
- DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
- DP("Continue, considering this is a device RTL which does not accept environment setting.\n");
- }
- }
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+ return elf_check_machine(image, /* EM_CUDA */ 190);
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
- return DeviceInfo.getOffloadEntriesTable(device_id);
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+ DP("Init requires flags to %ld\n", RequiresFlags);
+ DeviceRTL.setRequiresFlag(RequiresFlags);
+ return RequiresFlags;
}
-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
- if (size == 0) {
- return NULL;
- }
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error while trying to set CUDA current context\n");
- CUDA_ERR_STRING(err);
- return NULL;
- }
+ return DeviceRTL.initDevice(device_id);
+}
- CUdeviceptr ptr;
- err = cuMemAlloc(&ptr, size);
- if (err != CUDA_SUCCESS) {
- DP("Error while trying to allocate %d\n", err);
- CUDA_ERR_STRING(err);
- return NULL;
- }
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+ __tgt_device_image *image) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+ return DeviceRTL.loadBinary(device_id, image);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
- void *vptr = (void *)ptr;
- return vptr;
+ return DeviceRTL.dataAlloc(device_id, size);
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
int64_t size) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
__tgt_async_info async_info;
- int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size,
- &async_info);
+ const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
+ size, &async_info);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
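All the synchronous entry points now share this wrapper shape: create a local __tgt_async_info, call the asynchronous variant, then wait on it. A hedged sketch of the full pattern (the final wait sits in the context elided from this hunk; presumably it is __tgt_rtl_synchronize):

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  // Presumed elided step: wait for the enqueued copy to finish.
  return __tgt_rtl_synchronize(device_id, &async_info);
}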
@@ -856,15 +935,20 @@ int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
void *hst_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");
- return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
+
+ return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
+ async_info_ptr);
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
int64_t size) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
__tgt_async_info async_info;
- int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size,
- &async_info);
+ const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
+ size, &async_info);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
@@ -874,26 +958,17 @@ int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
void *tgt_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");
- return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
+
+ return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
+ async_info_ptr);
}
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
- err = cuMemFree((CUdeviceptr)tgt_ptr);
- if (err != CUDA_SUCCESS) {
- DP("Error when freeing CUDA memory\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
- return OFFLOAD_SUCCESS;
+ return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
@@ -902,8 +977,10 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
int32_t arg_num, int32_t team_num,
int32_t thread_limit,
uint64_t loop_tripcount) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
__tgt_async_info async_info;
- int32_t rc = __tgt_rtl_run_target_team_region_async(
+ const int32_t rc = __tgt_rtl_run_target_team_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
thread_limit, loop_tripcount, &async_info);
if (rc != OFFLOAD_SUCCESS)
@@ -916,123 +993,21 @@ int32_t __tgt_rtl_run_target_team_region_async(
int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
int32_t thread_limit, uint64_t loop_tripcount,
- __tgt_async_info *async_info) {
- // Set the context we are using.
- CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
- if (err != CUDA_SUCCESS) {
- DP("Error when setting CUDA context\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
-
- // All args are references.
- std::vector<void *> args(arg_num);
- std::vector<void *> ptrs(arg_num);
-
- for (int32_t i = 0; i < arg_num; ++i) {
- ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
- args[i] = &ptrs[i];
- }
+ __tgt_async_info *async_info_ptr) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
- KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
-
- int cudaThreadsPerBlock;
-
- if (thread_limit > 0) {
- cudaThreadsPerBlock = thread_limit;
- DP("Setting CUDA threads per block to requested %d\n", thread_limit);
- // Add master warp if necessary
- if (KernelInfo->ExecutionMode == GENERIC) {
- cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
- DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
- }
- } else {
- cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
- DP("Setting CUDA threads per block to default %d\n",
- DeviceInfo.NumThreads[device_id]);
- }
-
- if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
- cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
- DP("Threads per block capped at device limit %d\n",
- DeviceInfo.ThreadsPerBlock[device_id]);
- }
-
- int kernel_limit;
- err = cuFuncGetAttribute(&kernel_limit,
- CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
- if (err == CUDA_SUCCESS) {
- if (kernel_limit < cudaThreadsPerBlock) {
- cudaThreadsPerBlock = kernel_limit;
- DP("Threads per block capped at kernel limit %d\n", kernel_limit);
- }
- }
-
- int cudaBlocksPerGrid;
- if (team_num <= 0) {
- if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
- if (KernelInfo->ExecutionMode == SPMD) {
- // We have a combined construct, i.e. `target teams distribute parallel
- // for [simd]`. We launch so many teams so that each thread will
- // execute one iteration of the loop.
- // round up to the nearest integer
- cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
- } else {
- // If we reach this point, then we have a non-combined construct, i.e.
- // `teams distribute` with a nested `parallel for` and each team is
- // assigned one iteration of the `distribute` loop. E.g.:
- //
- // #pragma omp target teams distribute
- // for(...loop_tripcount...) {
- // #pragma omp parallel for
- // for(...) {}
- // }
- //
- // Threads within a team will execute the iterations of the `parallel`
- // loop.
- cudaBlocksPerGrid = loop_tripcount;
- }
- DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
- "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
- cudaThreadsPerBlock);
- } else {
- cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
- DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
- }
- } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
- cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
- DP("Capping number of teams to team limit %d\n",
- DeviceInfo.BlocksPerGrid[device_id]);
- } else {
- cudaBlocksPerGrid = team_num;
- DP("Using requested number of teams %d\n", team_num);
- }
-
- // Run on the device.
- DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
- cudaThreadsPerBlock);
-
- CUstream Stream = getStream(device_id, async_info);
- err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
- cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
- Stream, &args[0], 0);
- if (err != CUDA_SUCCESS) {
- DP("Device kernel launch failed!\n");
- CUDA_ERR_STRING(err);
- return OFFLOAD_FAIL;
- }
-
- DP("Launch of entry point at " DPxMOD " successful!\n",
- DPxPTR(tgt_entry_ptr));
-
- return OFFLOAD_SUCCESS;
+ return DeviceRTL.runTargetTeamRegion(
+ device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+ thread_limit, loop_tripcount, async_info_ptr);
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                   void **tgt_args, ptrdiff_t *tgt_offsets,
int32_t arg_num) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
__tgt_async_info async_info;
- int32_t rc = __tgt_rtl_run_target_region_async(
+ const int32_t rc = __tgt_rtl_run_target_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
@@ -1044,35 +1019,22 @@ int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
int32_t arg_num,
- __tgt_async_info *async_info) {
- // use one team and the default number of threads.
- const int32_t team_num = 1;
- const int32_t thread_limit = 0;
+ __tgt_async_info *async_info_ptr) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
return __tgt_rtl_run_target_team_region_async(
- device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
- thread_limit, 0, async_info);
+ device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
+ async_info_ptr);
}
-int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
- assert(async_info && "async_info is nullptr");
- assert(async_info->Queue && "async_info->Queue is nullptr");
-
- CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue);
- CUresult Err = cuStreamSynchronize(Stream);
- if (Err != CUDA_SUCCESS) {
- DP("Error when synchronizing stream. stream = " DPxMOD
- ", async info ptr = " DPxMOD "\n",
- DPxPTR(Stream), DPxPTR(async_info));
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- // Once the stream is synchronized, return it to stream pool and reset
- // async_info. This is to make sure the synchronization only works for its own
- // tasks.
- DeviceInfo.returnStream(device_id, async_info);
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+ __tgt_async_info *async_info_ptr) {
+ assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+ assert(async_info_ptr && "async_info_ptr is nullptr");
+ assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
- return OFFLOAD_SUCCESS;
+ return DeviceRTL.synchronize(device_id, async_info_ptr);
}
#ifdef __cplusplus