[Openmp-commits] [openmp] 2b8f722 - [OpenMP] Add option to assert no nested OpenMP parallelism on the GPU

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Tue Aug 23 12:10:24 PDT 2022


Author: Joseph Huber
Date: 2022-08-23T14:09:51-05:00
New Revision: 2b8f722e630d0fdf1ca267361866a27c8d4c9387

URL: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387
DIFF: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387.diff

LOG: [OpenMP] Add option to assert no nested OpenMP parallelism on the GPU

The OpenMP device runtime needs to support the OpenMP standard. However
constructs like nested parallelism are very uncommon in real application
yet lead to complexity in the runtime that is sometimes difficult to
optimize out. As a stop-gap for performance we should supply an argument
that selectively disables this feature. This patch adds the
`-fopenmp-assume-no-nested-parallelism` argument which explicitly
disables the usee of nested parallelism in OpenMP.

Reviewed By: carlo.bertolli

Differential Revision: https://reviews.llvm.org/D132074

Added: 
    

Modified: 
    clang/include/clang/Basic/LangOptions.def
    clang/include/clang/Driver/Options.td
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/test/OpenMP/target_globals_codegen.cpp
    openmp/libomptarget/DeviceRTL/include/Configuration.h
    openmp/libomptarget/DeviceRTL/src/Configuration.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4f3f6fc9da8c0..530f82268eb09 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -254,6 +254,7 @@ LANGOPT(OpenMPOptimisticCollapse  , 1, 0, "Use at most 32 bits to represent the
 LANGOPT(OpenMPThreadSubscription  , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
 LANGOPT(OpenMPTeamSubscription  , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
 LANGOPT(OpenMPNoThreadState  , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
+LANGOPT(OpenMPNoNestedParallelism  , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
 LANGOPT(OpenMPOffloadMandatory  , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
 LANGOPT(NoGPULib  , 1, 0, "Indicate a build without the standard GPU libraries.")
 LANGOPT(RenderScript      , 1, 0, "RenderScript")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0666768c7e74b..76aed6a7c928f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2585,6 +2585,10 @@ def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state
   Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
   HelpText<"Assert no thread in a parallel region modifies an ICV">,
   MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
+def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group<f_Group>,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+  HelpText<"Assert no nested parallel regions in the GPU">,
+  MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
 def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>,
   Flags<[CC1Option, NoArgumentUnused]>,
   HelpText<"Do not create a host fallback if offloading to the device fails.">,

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index e8affaaecc980..54e9c4d844a3c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1213,6 +1213,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
                               "__omp_rtl_assume_threads_oversubscription");
   OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                               "__omp_rtl_assume_no_thread_state");
+  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
+                              "__omp_rtl_assume_no_nested_parallelism");
 }
 
 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b867adc553033..34bf7a797e7e8 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6128,6 +6128,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
         CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
       if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
         CmdArgs.push_back("-fopenmp-assume-no-thread-state");
+      if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism))
+        CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
       if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
         CmdArgs.push_back("-fopenmp-offload-mandatory");
       break;
@@ -8426,8 +8428,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
 
     for (StringRef LibName : BCLibs)
       CmdArgs.push_back(Args.MakeArgString(
-          "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) +
-          "-" + TC->getTripleString() + "-" + Arch + "=" + LibName));
+          "--bitcode-library=" +
+          Action::GetOffloadKindName(Action::OFK_OpenMP) + "-" +
+          TC->getTripleString() + "-" + Arch + "=" + LibName));
   }
 
   if (D.isUsingLTO(/* IsOffload */ true)) {

diff  --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp
index ad0c097e09103..45baa7b5d0445 100644
--- a/clang/test/OpenMP/target_globals_codegen.cpp
+++ b/clang/test/OpenMP/target_globals_codegen.cpp
@@ -7,6 +7,7 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
 // expected-no-diagnostics
@@ -19,36 +20,49 @@
 // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111
 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1
 // CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 // CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1
+// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
+//.
+// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1
 //.
 // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0
 //.
 void foo() {
 #pragma omp target

diff  --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 368a7c35ac4a8..72514b82896f9 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -44,6 +44,10 @@ bool isDebugMode(DebugKind Level);
 /// explicitly disabled by the user.
 bool mayUseThreadStates();
 
+/// Indicates if this kernel may require data environments for nested
+/// parallelism, or if it was explicitly disabled by the user.
+bool mayUseNestedParallelism();
+
 } // namespace config
 } // namespace _OMP
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index b3d779a96361f..a41574cfc21d6 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -23,6 +23,7 @@ using namespace _OMP;
 // defined by CGOpenMPRuntimeGPU
 extern uint32_t __omp_rtl_debug_kind;
 extern uint32_t __omp_rtl_assume_no_thread_state;
+extern uint32_t __omp_rtl_assume_no_nested_parallelism;
 
 // TODO: We want to change the name as soon as the old runtime is gone.
 // This variable should be visibile to the plugin so we override the default
@@ -52,4 +53,8 @@ bool config::isDebugMode(config::DebugKind Kind) {
 
 bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
 
+bool config::mayUseNestedParallelism() {
+  return !__omp_rtl_assume_no_nested_parallelism;
+}
+
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 27d1ff2e5a55c..5ebf3687ac43f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -86,11 +86,16 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
   uint32_t TId = mapping::getThreadIdInBlock();
 
+  // Assert the parallelism level is zero if disabled by the user.
+  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0) &&
+         "nested parallelism while disabled");
+
   // Handle the serialized case first, same for SPMD/non-SPMD:
   // 1) if-clause(0)
-  // 2) nested parallel regions
-  // 3) parallel in task or other thread state inducing construct
-  if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
+  // 2) parallel in task or other thread state inducing construct
+  // 3) nested parallel regions
+  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
+                   (config::mayUseNestedParallelism() && icv::Level))) {
     state::DateEnvironmentRAII DERAII(ident);
     ++icv::Level;
     invokeMicrotask(TId, 0, fn, args, nargs);


        


More information about the Openmp-commits mailing list