[Openmp-commits] [openmp] 597d2f7 - [OpenMP] Add Environment Variable to disable Reuse of Blocks for High Loop Trip Counts (#89239)

via Openmp-commits openmp-commits at lists.llvm.org
Fri Jun 14 07:35:27 PDT 2024


Author: Tim Gymnich
Date: 2024-06-14T07:35:23-07:00
New Revision: 597d2f7662c31cae4c8a54cc27e2ea12833380ea

URL: https://github.com/llvm/llvm-project/commit/597d2f7662c31cae4c8a54cc27e2ea12833380ea
DIFF: https://github.com/llvm/llvm-project/commit/597d2f7662c31cae4c8a54cc27e2ea12833380ea.diff

LOG: [OpenMP] Add Environment Variable to disable Reuse of Blocks for High Loop Trip Counts (#89239)

Sometimes it might be beneficial to spawn more thread blocks instead of
reusing existing for multiple loop iterations.

**Alternatives considered:**

Make `DefaultNumBlocks` settable via an environment variable.

---------

Co-authored-by: Joseph Huber <huberjn at outlook.com>

Added: 
    offload/test/offloading/high_trip_count_block_limit.cpp

Modified: 
    offload/plugins-nextgen/common/include/PluginInterface.h
    offload/plugins-nextgen/common/src/PluginInterface.cpp
    openmp/docs/design/Runtimes.rst

Removed: 
    


################################################################################
diff  --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 0d2a36a42d5fa..973add0ba1000 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -826,6 +826,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return OMPX_MinThreadsForLowTripCount;
   }
 
+  /// Whether or not to reuse blocks for high trip count loops.
+  /// @see OMPX_ReuseBlocksForHighTripCount
+  bool getReuseBlocksForHighTripCount() {
+    return OMPX_ReuseBlocksForHighTripCount;
+  }
+
   /// Get the total amount of hardware parallelism supported by the target
   /// device. This is the total amount of warps or wavefronts that can be
   /// resident on the device simultaneously.
@@ -901,6 +907,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_MinThreadsForLowTripCount =
       UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
 
+  BoolEnvar OMPX_ReuseBlocksForHighTripCount =
+      BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
+
 protected:
   /// Environment variables defined by the LLVM OpenMP implementation
   /// regarding the initial number of streams and events.

diff  --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94f9d4670b672..118265973f327 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -701,8 +701,11 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
       TripCountNumBlocks = LoopTripCount;
     }
   }
+
+  uint32_t PreferredNumBlocks = TripCountNumBlocks;
   // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
+  if (GenericDevice.getReuseBlocksForHighTripCount())
+    PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
 

diff  --git a/offload/test/offloading/high_trip_count_block_limit.cpp b/offload/test/offloading/high_trip_count_block_limit.cpp
new file mode 100644
index 0000000000000..d0e39274e27d6
--- /dev/null
+++ b/offload/test/offloading/high_trip_count_block_limit.cpp
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-compilexx-generic && %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu 
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO 
+// UNSUPPORTED: x86_64-pc-linux-gnu 
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO 
+// UNSUPPORTED: s390x-ibm-linux-gnu 
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// clang-format on
+
+/*
+  Check if there is a thread for each loop iteration
+*/
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int N = 819200;
+  int num_threads[N];
+
+#pragma omp target teams distribute parallel for
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads() * omp_get_num_teams();
+  }
+
+  if (num_threads[0] == N)
+    // CHECK: PASS
+    printf("PASS\n");
+  else
+    // DEFAULT: FAIL
+    printf("FAIL: num_threads: %d\n != N: %d", num_threads[0], N);
+  return 0;
+}

diff  --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index f8a8cb87e83e6..98dd984fd4b0c 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -742,6 +742,7 @@ variables is defined below.
     * ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
+    * ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -1162,6 +1163,12 @@ of threads possible times the number of teams (aka. blocks) the device prefers
 count to increase outer (team/block) parallelism. The thread count will never
 be reduced below the value passed for this environment variable though.
 
+LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT
+"""""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable can be used to control how the OpenMP runtime assigns
+blocks to loops with high trip counts. By default we reuse existing blocks
+rather than spawning new blocks.
 
 
 .. _libomptarget_plugin:


        


More information about the Openmp-commits mailing list