[Openmp-commits] [PATCH] D32321: [OpenMP] Optimized default kernel launch parameters in CUDA plugin
George Rokos via Phabricator via Openmp-commits
openmp-commits at lists.llvm.org
Tue Apr 25 08:10:58 PDT 2017
grokos updated this revision to Diff 96567.
grokos marked an inline comment as done.
grokos added a comment.
Wrote inline comments to make clear what the new default launch configuration is about.
Repository:
rL LLVM
https://reviews.llvm.org/D32321
Files:
libomptarget/plugins/cuda/src/rtl.cpp
Index: libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- libomptarget/plugins/cuda/src/rtl.cpp
+++ libomptarget/plugins/cuda/src/rtl.cpp
@@ -51,8 +51,9 @@
};
enum ExecutionModeType {
- SPMD,
- GENERIC,
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
NONE
};
@@ -99,7 +100,7 @@
static const int HardTeamLimit = 1<<16; // 64k
static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;
- static const int DefaultNumThreads = 1024;
+ static const int DefaultNumThreads = 128;
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
@@ -581,18 +582,17 @@
if (thread_limit > 0) {
cudaThreadsPerBlock = thread_limit;
DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+ // Add master warp if necessary
+ if (KernelInfo->ExecutionMode == GENERIC) {
+ cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+ DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+ }
} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);
}
- // Add master warp if necessary
- if (KernelInfo->ExecutionMode == GENERIC) {
- cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
- DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
- }
-
if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",
@@ -612,8 +612,27 @@
int cudaBlocksPerGrid;
if (team_num <= 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
- // round up to the nearest integer
- cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+ if (KernelInfo->ExecutionMode == SPMD) {
+ // We have a combined construct, i.e. `target teams distribute parallel
+ // for [simd]`. We launch so many teams so that each thread will
+ // execute one iteration of the loop.
+ // round up to the nearest integer
+ cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+ } else {
+ // If we reach this point, then we have a non-combined construct, i.e.
+ // `teams distribute` with a nested `parallel for` and each team is
+ // assigned one iteration of the `distribute` loop. E.g.:
+ //
+ // #pragma omp target teams distribute
+ // for(...loop_tripcount...) {
+ // #pragma omp parallel for
+ // for(...) {}
+ // }
+ //
+ // Threads within a team will execute the iterations of the `parallel`
+ // loop.
+ cudaBlocksPerGrid = loop_tripcount;
+ }
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
cudaThreadsPerBlock);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D32321.96567.patch
Type: text/x-patch
Size: 3185 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20170425/d23170f4/attachment.bin>
More information about the Openmp-commits
mailing list