[clang] 0870a4f - [OpenMP] Add flag for disabling thread state in runtime

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Fri Feb 18 05:35:19 PST 2022


Author: Joseph Huber
Date: 2022-02-18T08:35:05-05:00
New Revision: 0870a4f59aef21bf7707b00ebd4dcad7ce7ef807

URL: https://github.com/llvm/llvm-project/commit/0870a4f59aef21bf7707b00ebd4dcad7ce7ef807
DIFF: https://github.com/llvm/llvm-project/commit/0870a4f59aef21bf7707b00ebd4dcad7ce7ef807.diff

LOG: [OpenMP] Add flag for disabling thread state in runtime

The runtime uses thread state values to indicate when we use an ICV or
are in nested parallelism. This is done for OpenMP correctness, but it
not needed in the majority of cases. The new flag added is
`-fopenmp-assume-no-thread-state`.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D120106

Added: 
    

Modified: 
    clang/include/clang/Basic/LangOptions.def
    clang/include/clang/Driver/Options.td
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/test/OpenMP/target_globals_codegen.cpp
    openmp/libomptarget/DeviceRTL/include/Configuration.h
    openmp/libomptarget/DeviceRTL/src/Configuration.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4651f4fff6aa0..e21998860f217 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -246,6 +246,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
 LANGOPT(OpenMPOptimisticCollapse  , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
 LANGOPT(OpenMPThreadSubscription  , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
 LANGOPT(OpenMPTeamSubscription  , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
+LANGOPT(OpenMPNoThreadState  , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
 LANGOPT(RenderScript      , 1, 0, "RenderScript")
 
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 76cfdbcd85f26..c377329e8f6f4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2473,6 +2473,10 @@ def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-te
   Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
 def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">,
   Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, Group<f_Group>, 
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, 
+  HelpText<"Assert no thread in a parallel region modifies an ICV">,
+  MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
 defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime",
   LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue,
   PosFlag<SetTrue, [CC1Option], "Use the new bitcode library for OpenMP offloading">,

diff  --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index bb6847ab87319..fcaf9d4ed77b3 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1210,6 +1210,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
                                 "__omp_rtl_assume_teams_oversubscription");
     OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                                 "__omp_rtl_assume_threads_oversubscription");
+    OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
+                                "__omp_rtl_assume_no_thread_state");
   }
 }
 

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a16175ebebbca..32cbb7936f7ee 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5995,6 +5995,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                        options::OPT_fno_openmp_assume_threads_oversubscription,
                        /*Default=*/false))
         CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
+      if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
+        CmdArgs.push_back("-fopenmp-assume-no-thread-state");
       break;
     default:
       // By default, if Clang doesn't know how to generate useful OpenMP code

diff  --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp
index fa7569cd4ca6b..3c5d4b8ed3984 100644
--- a/clang/test/OpenMP/target_globals_codegen.cpp
+++ b/clang/test/OpenMP/target_globals_codegen.cpp
@@ -6,6 +6,7 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
 // expected-no-diagnostics
 
@@ -16,26 +17,37 @@
 // CHECK: @__omp_rtl_debug_kind = weak_odr hidden constant i32 1
 // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
 //.
 // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111
 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
 //.
 // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
 //.
 // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1
+// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
 //.
 // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
+//.
+// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
+// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0
+// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1
 //.
 // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0
 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0
+// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0
 //.
 void foo() {
 #pragma omp target

diff  --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 5727f1f2bfbf6..94f11b6066a20 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -38,8 +38,13 @@ uint32_t getDebugKind();
 /// Return the amount of dynamic shared memory that was allocated at launch.
 uint64_t getDynamicMemorySize();
 
+/// Return if debugging is enabled for the given debug kind.
 bool isDebugMode(DebugKind Level);
 
+/// Indicates if this kernel may require thread-specific states, or if it was
+/// explicitly disabled by the user.
+bool mayUseThreadStates();
+
 } // namespace config
 } // namespace _OMP
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index 349f93a08701c..e9cc9bb0e318e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -20,7 +20,9 @@ using namespace _OMP;
 
 #pragma omp declare target
 
-extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU
+// defined by CGOpenMPRuntimeGPU
+extern uint32_t __omp_rtl_debug_kind;
+extern uint32_t __omp_rtl_assume_no_thread_state;
 
 // TODO: We want to change the name as soon as the old runtime is gone.
 // This variable should be visibile to the plugin so we override the default
@@ -48,4 +50,6 @@ bool config::isDebugMode(config::DebugKind Kind) {
   return config::getDebugKind() & Kind;
 }
 
+bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
+
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index a04f5cccb1738..a530c5e0b2471 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -285,7 +285,8 @@ ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
 #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
 
 uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
-  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
+  if (OMP_LIKELY(!config::mayUseThreadStates() ||
+                 TeamState.ICVState.LevelVar == 0))
     return TeamState.ICVState.*Var;
   uint32_t TId = mapping::getThreadIdInBlock();
   if (!ThreadStates[TId]) {
@@ -299,13 +300,13 @@ uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
 
 uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
   uint32_t TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(ThreadStates[TId]))
+  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
     return ThreadStates[TId]->ICVState.*Var;
   return TeamState.ICVState.*Var;
 }
 uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
   uint64_t TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(ThreadStates[TId]))
+  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
     return ThreadStates[TId]->ICVState.*Var;
   return TeamState.ICVState.*Var;
 }
@@ -380,6 +381,9 @@ void state::init(bool IsSPMD) {
 }
 
 void state::enterDataEnvironment(IdentTy *Ident) {
+  ASSERT(config::mayUseThreadStates() &&
+         "Thread state modified while explicitly disabled!");
+
   unsigned TId = mapping::getThreadIdInBlock();
   ThreadStateTy *NewThreadState =
       static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
@@ -388,6 +392,9 @@ void state::enterDataEnvironment(IdentTy *Ident) {
 }
 
 void state::exitDataEnvironment() {
+  ASSERT(config::mayUseThreadStates() &&
+         "Thread state modified while explicitly disabled!");
+
   unsigned TId = mapping::getThreadIdInBlock();
   resetStateForThread(TId);
 }


        


More information about the cfe-commits mailing list