[clang] 2b8f722 - [OpenMP] Add option to assert no nested OpenMP parallelism on the GPU
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Tue Aug 23 12:10:23 PDT 2022
Author: Joseph Huber
Date: 2022-08-23T14:09:51-05:00
New Revision: 2b8f722e630d0fdf1ca267361866a27c8d4c9387
URL: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387
DIFF: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387.diff
LOG: [OpenMP] Add option to assert no nested OpenMP parallelism on the GPU
The OpenMP device runtime needs to support the OpenMP standard. However
constructs like nested parallelism are very uncommon in real application
yet lead to complexity in the runtime that is sometimes difficult to
optimize out. As a stop-gap for performance we should supply an argument
that selectively disables this feature. This patch adds the
`-fopenmp-assume-no-nested-parallelism` argument which explicitly
disables the usee of nested parallelism in OpenMP.
Reviewed By: carlo.bertolli
Differential Revision: https://reviews.llvm.org/D132074
Added:
Modified:
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/test/OpenMP/target_globals_codegen.cpp
openmp/libomptarget/DeviceRTL/include/Configuration.h
openmp/libomptarget/DeviceRTL/src/Configuration.cpp
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4f3f6fc9da8c0..530f82268eb09 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -254,6 +254,7 @@ LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
+LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.")
LANGOPT(RenderScript , 1, 0, "RenderScript")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0666768c7e74b..76aed6a7c928f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2585,6 +2585,10 @@ def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Assert no thread in a parallel region modifies an ICV">,
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
+def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group<f_Group>,
+ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+ HelpText<"Assert no nested parallel regions in the GPU">,
+ MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Do not create a host fallback if offloading to the device fails.">,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index e8affaaecc980..54e9c4d844a3c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1213,6 +1213,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
"__omp_rtl_assume_threads_oversubscription");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
"__omp_rtl_assume_no_thread_state");
+ OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
+ "__omp_rtl_assume_no_nested_parallelism");
}
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b867adc553033..34bf7a797e7e8 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6128,6 +6128,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
CmdArgs.push_back("-fopenmp-assume-no-thread-state");
+ if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism))
+ CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
break;
@@ -8426,8 +8428,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
for (StringRef LibName : BCLibs)
CmdArgs.push_back(Args.MakeArgString(
- "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) +
- "-" + TC->getTripleString() + "-" + Arch + "=" + LibName));
+ "--bitcode-library=" +
+ Action::GetOffloadKindName(Action::OFK_OpenMP) + "-" +
+ TC->getTripleString() + "-" + Arch + "=" + LibName));
}
if (D.isUsingLTO(/* IsOffload */ true)) {
diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp
index ad0c097e09103..45baa7b5d0445 100644
--- a/clang/test/OpenMP/target_globals_codegen.cpp
+++ b/clang/test/OpenMP/target_globals_codegen.cpp
@@ -7,6 +7,7 @@
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
@@ -19,36 +20,49 @@
// CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111
// CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1
// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1
+// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
+//.
+// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1
//.
// CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
//.
void foo() {
#pragma omp target
diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 368a7c35ac4a8..72514b82896f9 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -44,6 +44,10 @@ bool isDebugMode(DebugKind Level);
/// explicitly disabled by the user.
bool mayUseThreadStates();
+/// Indicates if this kernel may require data environments for nested
+/// parallelism, or if it was explicitly disabled by the user.
+bool mayUseNestedParallelism();
+
} // namespace config
} // namespace _OMP
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index b3d779a96361f..a41574cfc21d6 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -23,6 +23,7 @@ using namespace _OMP;
// defined by CGOpenMPRuntimeGPU
extern uint32_t __omp_rtl_debug_kind;
extern uint32_t __omp_rtl_assume_no_thread_state;
+extern uint32_t __omp_rtl_assume_no_nested_parallelism;
// TODO: We want to change the name as soon as the old runtime is gone.
// This variable should be visibile to the plugin so we override the default
@@ -52,4 +53,8 @@ bool config::isDebugMode(config::DebugKind Kind) {
bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
+bool config::mayUseNestedParallelism() {
+ return !__omp_rtl_assume_no_nested_parallelism;
+}
+
#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 27d1ff2e5a55c..5ebf3687ac43f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -86,11 +86,16 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
uint32_t TId = mapping::getThreadIdInBlock();
+ // Assert the parallelism level is zero if disabled by the user.
+ ASSERT((config::mayUseNestedParallelism() || icv::Level == 0) &&
+ "nested parallelism while disabled");
+
// Handle the serialized case first, same for SPMD/non-SPMD:
// 1) if-clause(0)
- // 2) nested parallel regions
- // 3) parallel in task or other thread state inducing construct
- if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
+ // 2) parallel in task or other thread state inducing construct
+ // 3) nested parallel regions
+ if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
+ (config::mayUseNestedParallelism() && icv::Level))) {
state::DateEnvironmentRAII DERAII(ident);
++icv::Level;
invokeMicrotask(TId, 0, fn, args, nargs);
More information about the cfe-commits
mailing list