[Openmp-commits] [openmp] f914208 - [OpenMP][NFCI] Avoid storing non-constant values in ICV

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Tue Jul 18 16:57:24 PDT 2023


Author: Johannes Doerfert
Date: 2023-07-18T16:50:50-07:00
New Revision: f914208c4388780e24ed3a1ab2e170c53129b2a1

URL: https://github.com/llvm/llvm-project/commit/f914208c4388780e24ed3a1ab2e170c53129b2a1
DIFF: https://github.com/llvm/llvm-project/commit/f914208c4388780e24ed3a1ab2e170c53129b2a1.diff

LOG: [OpenMP][NFCI] Avoid storing non-constant values in ICV

If we store a constant in an ICV it is easier for the optimizer to
propagate it. Since we often use the full block for the thread limit and
the parallel team size, we can instead replace that dynamic value with a
constant that otherwise cannot occur, here 0. Code that reads these
values maps 0 back to the block size.

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/State.h
    openmp/libomptarget/DeviceRTL/src/Mapping.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
index c269cf1353c85a..3491866c2dc9bf 100644
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -52,6 +52,7 @@ struct ICVStateTy {
   uint32_t NThreadsVar;
   uint32_t LevelVar;
   uint32_t ActiveLevelVar;
+  uint32_t Padding0Val;
   uint32_t MaxActiveLevelsVar;
   uint32_t RunSchedVar;
   uint32_t RunSchedChunkVar;
@@ -339,6 +340,9 @@ void runAndCheckState(void(Func(void)));
 
 void assumeInitialState(bool IsSPMD);
 
+/// Return the ParallelTeamSize ICV, or the block size if that ICV is 0.
+int getEffectivePTeamSize();
+
 } // namespace state
 
 namespace icv {

diff  --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 447b3476ebcc68..78361284ff8de2 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -205,7 +205,6 @@ uint32_t mapping::getThreadIdInWarp() {
 
 uint32_t mapping::getThreadIdInBlock() {
   uint32_t ThreadIdInBlock = impl::getThreadIdInBlock();
-  ASSERT(ThreadIdInBlock < impl::getNumHardwareThreadsInBlock(), nullptr);
   return ThreadIdInBlock;
 }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 78cd6c046f9c89..82b944a8bd0a21 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -110,6 +110,8 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   ASSERT(state::HasThreadState == false, nullptr);
 
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  uint32_t BlockSize = mapping::getBlockSize();
+  uint32_t PTeamSize = NumThreads == BlockSize ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // Avoid the race between the read of the `icv::Level` above and the write
     // below by synchronizing all threads here.
@@ -118,7 +120,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
       // Note that the order here is important. `icv::Level` has to be updated
       // last or the other updates will cause a thread specific state to be
       // created.
-      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
+      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                             1u, TId == 0, ident,
                                             /* ForceTeamState */ true);
       state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
@@ -130,7 +132,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
       // team state properly.
       synchronize::threadsAligned(atomic::acq_rel);
 
-      state::ParallelTeamSize.assert_eq(NumThreads, ident,
+      state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                         /* ForceTeamState */ true);
       icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
       icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
@@ -139,7 +141,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
       // assumptions above.
       synchronize::threadsAligned(atomic::relaxed);
 
-      if (TId < NumThreads)
+      if (!PTeamSize || TId < PTeamSize)
         invokeMicrotask(TId, 0, fn, args, nargs);
 
       // Synchronize all threads at the end of a parallel region.
@@ -239,7 +241,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
+    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, true, ident,
                                           /* ForceTeamState */ true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
@@ -272,7 +274,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
 
   // Set to true for workers participating in the parallel region.
   uint32_t TId = mapping::getThreadIdInBlock();
-  bool ThreadIsActive = TId < state::ParallelTeamSize;
+  bool ThreadIsActive = TId < state::getEffectivePTeamSize();
   return ThreadIsActive;
 }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index b97d230fd50194..c181478c218db5 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -198,9 +198,10 @@ void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
 }
 
 void state::TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
+  ICVState.NThreadsVar = 0;
   ICVState.LevelVar = 0;
   ICVState.ActiveLevelVar = 0;
+  ICVState.Padding0Val = 0;
   ICVState.MaxActiveLevelsVar = 1;
   ICVState.RunSchedVar = omp_sched_static;
   ICVState.RunSchedChunkVar = 1;
@@ -312,6 +313,11 @@ void state::assumeInitialState(bool IsSPMD) {
   ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
 }
 
+int state::getEffectivePTeamSize() {
+  int PTeamSize = state::ParallelTeamSize;
+  return PTeamSize ? PTeamSize : mapping::getBlockSize();
+}
+
 extern "C" {
 void omp_set_dynamic(int V) {}
 
@@ -319,7 +325,10 @@ int omp_get_dynamic(void) { return 0; }
 
 void omp_set_num_threads(int V) { icv::NThreads = V; }
 
-int omp_get_max_threads(void) { return icv::NThreads; }
+int omp_get_max_threads(void) {
+  int NT = icv::NThreads;
+  return NT > 0 ? NT : mapping::getBlockSize();
+}
 
 int omp_get_level(void) {
   int LevelVar = icv::Level;
@@ -350,11 +359,11 @@ int omp_get_thread_num(void) {
 }
 
 int omp_get_team_size(int Level) {
-  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
+  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
 }
 
 int omp_get_num_threads(void) {
-  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
+  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
 }
 
 int omp_get_thread_limit(void) { return mapping::getBlockSize(); }


        


More information about the Openmp-commits mailing list