[Openmp-commits] [openmp] 7d57639 - [OpenMP] Add new execution mode for SPMD execution with Generic semantics
via Openmp-commits
openmp-commits at lists.llvm.org
Wed Jul 21 17:57:39 PDT 2021
Author: Joseph Huber
Date: 2021-07-21T20:57:28-04:00
New Revision: 7d576392644d44a765336f6ebefd45f5db61231e
URL: https://github.com/llvm/llvm-project/commit/7d576392644d44a765336f6ebefd45f5db61231e
DIFF: https://github.com/llvm/llvm-project/commit/7d576392644d44a765336f6ebefd45f5db61231e.diff
LOG: [OpenMP] Add new execution mode for SPMD execution with Generic semantics
Qualified kernels can be transformed from generic-mode to SPMD mode using an
optimization in OpenMPOpt. This patch introduces a new execution mode to
indicate kernels that have been transformed from generic-mode to SPMD-mode.
These kernels have SPMD-mode execution, but need generic-mode semantics for
scheduling the blocks and threads. Without this, far too few blocks will be
scheduled for a generic region as SPMD mode expects the trip count to be
divided by the number of threads.
Reviewed By: ggeorgakoudis
Differential Revision: https://reviews.llvm.org/D106460
Added:
Modified:
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
llvm/test/Transforms/OpenMP/spmdization.ll
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index c2c212960fd6b..44aa249ad08c1 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2886,8 +2886,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
assert(ExecMode->getInitializer() &&
ExecMode->getInitializer()->isOneValue() &&
"Initially non-SPMD kernel has SPMD exec mode!");
- ExecMode->setInitializer(
- ConstantInt::get(ExecMode->getInitializer()->getType(), 0));
+
+ // Set the global exec mode flag to indicate SPMD-Generic mode.
+ constexpr int SPMDGeneric = 2;
+ if (!ExecMode->getInitializer()->isZeroValue())
+ ExecMode->setInitializer(
+ ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
// Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
const int InitIsSPMDArgNo = 1;
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
index 9ff9d68c5f35a..9144cd32ef4f7 100644
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64"
;.
; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
-; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 7dedbc851cfed..c19ecbfe99d65 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -32,7 +32,7 @@ target triple = "nvptx64"
;.
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"
;.
define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 {
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 21901365b7714..843e2a11928e5 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -127,9 +127,10 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
- SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
- GENERIC, // everything else
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ SPMD_GENERIC, // Generic kernel with SPMD execution
NONE
};
@@ -240,6 +241,7 @@ struct KernelTy {
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
+ // 2 - SPMD mode execution with Generic mode semantics.
int8_t ExecutionMode;
int16_t ConstWGSize;
int32_t device_id;
@@ -1730,7 +1732,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
DP("After loading global for %s ExecMode = %d\n", ExecModeName,
ExecModeVal);
- if (ExecModeVal < 0 || ExecModeVal > 1) {
+ if (ExecModeVal < 0 || ExecModeVal > 2) {
DP("Error wrong exec_mode value specified in HSA code object file: "
"%d\n",
ExecModeVal);
@@ -1965,7 +1967,11 @@ launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
if (ExecutionMode == SPMD) {
// round up to the nearest integer
num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
- } else {
+ } else if (ExecutionMode == GENERIC) {
+ num_groups = loop_tripcount;
+ } else if (ExecutionMode == SPMD_GENERIC) {
+ // This is a generic kernel that was transformed to use SPMD-mode
+ // execution but uses Generic-mode semantics for scheduling.
num_groups = loop_tripcount;
}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 7b04bc9180e5b..fff33fe6b5fff 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -70,9 +70,10 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
- SPMD, // constructors, destructors,
- // combined constructs (`teams distribute parallel for [simd]`)
- GENERIC, // everything else
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ SPMD_GENERIC, // Generic kernel with SPMD execution
NONE
};
@@ -83,6 +84,7 @@ struct KernelTy {
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
+ // 2 - SPMD mode execution with Generic mode semantics.
int8_t ExecutionMode;
/// Maximal number of threads per block for this kernel.
@@ -796,7 +798,7 @@ class DeviceRTLTy {
return nullptr;
}
- if (ExecModeVal < 0 || ExecModeVal > 1) {
+ if (ExecModeVal < 0 || ExecModeVal > 2) {
DP("Error wrong exec_mode value specified in cubin file: %d\n",
ExecModeVal);
return nullptr;
@@ -1045,7 +1047,7 @@ class DeviceRTLTy {
// will execute one iteration of the loop. round up to the nearest
// integer
CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
- } else {
+ } else if (KernelInfo->ExecutionMode == GENERIC) {
// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:
@@ -1059,6 +1061,14 @@ class DeviceRTLTy {
// Threads within a team will execute the iterations of the `parallel`
// loop.
CudaBlocksPerGrid = LoopTripCount;
+ } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
+ // If we reach this point, then we are executing a kernel that was
+ // transformed from Generic-mode to SPMD-mode. This kernel has
+ // SPMD-mode execution, but needs its blocks to be scheduled
+ // differently because the current loop trip count only applies to the
+ // `teams distribute` region and will create far too few blocks using
+ // the regular SPMD-mode method.
+ CudaBlocksPerGrid = LoopTripCount;
}
DP("Using %d teams due to loop trip count %" PRIu32
" and number of threads per block %d\n",
@@ -1083,7 +1093,9 @@ class DeviceRTLTy {
? getOffloadEntry(DeviceId, TgtEntryPtr)->name
: "(null)",
CudaBlocksPerGrid, CudaThreadsPerBlock,
- (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
+ (KernelInfo->ExecutionMode != SPMD
+ ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic")
+ : "SPMD"));
CUstream Stream = getStream(DeviceId, AsyncInfo);
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
More information about the Openmp-commits
mailing list