[Openmp-commits] [openmp] r301321 - [OpenMP] Optimized default kernel launch parameters in CUDA plugin
George Rokos via Openmp-commits
openmp-commits at lists.llvm.org
Tue Apr 25 09:34:13 PDT 2017
Author: grokos
Date: Tue Apr 25 11:34:13 2017
New Revision: 301321
URL: http://llvm.org/viewvc/llvm-project?rev=301321&view=rev
Log:
[OpenMP] Optimized default kernel launch parameters in CUDA plugin
Differential Revision: https://reviews.llvm.org/D32321
Modified:
openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
Modified: openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp?rev=301321&r1=301320&r2=301321&view=diff
==============================================================================
--- openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp (original)
+++ openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp Tue Apr 25 11:34:13 2017
@@ -51,8 +51,9 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
- SPMD,
- GENERIC,
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
NONE
};
@@ -99,7 +100,7 @@ public:
static const int HardTeamLimit = 1<<16; // 64k
static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;
- static const int DefaultNumThreads = 1024;
+ static const int DefaultNumThreads = 128;
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
@@ -581,18 +582,17 @@ int32_t __tgt_rtl_run_target_team_region
if (thread_limit > 0) {
cudaThreadsPerBlock = thread_limit;
DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+ // Add master warp if necessary
+ if (KernelInfo->ExecutionMode == GENERIC) {
+ cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+ DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+ }
} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);
}
- // Add master warp if necessary
- if (KernelInfo->ExecutionMode == GENERIC) {
- cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
- DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
- }
-
if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",
@@ -612,8 +612,27 @@ int32_t __tgt_rtl_run_target_team_region
int cudaBlocksPerGrid;
if (team_num <= 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
- // round up to the nearest integer
- cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+ if (KernelInfo->ExecutionMode == SPMD) {
+ // We have a combined construct, i.e. `target teams distribute parallel
+ // for [simd]`. We launch so many teams so that each thread will
+ // execute one iteration of the loop.
+ // round up to the nearest integer
+ cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+ } else {
+ // If we reach this point, then we have a non-combined construct, i.e.
+ // `teams distribute` with a nested `parallel for` and each team is
+ // assigned one iteration of the `distribute` loop. E.g.:
+ //
+ // #pragma omp target teams distribute
+ // for(...loop_tripcount...) {
+ // #pragma omp parallel for
+ // for(...) {}
+ // }
+ //
+ // Threads within a team will execute the iterations of the `parallel`
+ // loop.
+ cudaBlocksPerGrid = loop_tripcount;
+ }
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
cudaThreadsPerBlock);
More information about the Openmp-commits
mailing list