[Openmp-commits] [llvm] [openmp] [OpenMP] Add Environment Variable to disable Reuse of Blocks for High Loop Trip Counts (PR #89239)

Tim Gymnich via Openmp-commits openmp-commits at lists.llvm.org
Mon May 6 07:16:33 PDT 2024


https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/89239

>From 6c7fdcc90dc5b9fad489af23f65309471ac1f1c7 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Wed, 10 Apr 2024 18:39:02 +0000
Subject: [PATCH 1/7] Add Environment Variable to disable Reuse of Blocks for
 high Loop Trip Counts

---
 offload/plugins-nextgen/common/include/PluginInterface.h | 9 +++++++++
 offload/plugins-nextgen/common/src/PluginInterface.cpp   | 6 +++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 79e8464bfda5c1..936e090dc07a16 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -829,6 +829,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return OMPX_MinThreadsForLowTripCount;
   }
 
+  /// Whether or not to reuse blocks for high trip count loops.
+  /// @see OMPX__ReuseBlocksForHighTripCount
+  virtual bool getReuseBlocksForHighTripCount() {
+    return OMPX__ReuseBlocksForHighTripCount;
+  }
+
   /// Get the total amount of hardware parallelism supported by the target
   /// device. This is the total amount of warps or wavefronts that can be
   /// resident on the device simultaneously.
@@ -904,6 +910,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_MinThreadsForLowTripCount =
       UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
 
+  BoolEnvar OMPX__ReuseBlocksForHighTripCount =
+      BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
+
 protected:
   /// Environment variables defined by the LLVM OpenMP implementation
   /// regarding the initial number of streams and events.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index b5f3c45c835fdb..41542ea1123c29 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -705,8 +705,12 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
       TripCountNumBlocks = LoopTripCount;
     }
   }
+
+  uint32_t PreferredNumBlocks = TripCountNumBlocks;
   // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
+  if (GenericDevice.getReuseBlocksForHighTripCount()) {
+    PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
+  }
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
 

>From a7b554917a7b011718aafba866964d314cbc27e8 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 19 Apr 2024 14:08:56 +0200
Subject: [PATCH 2/7] Update
 openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp

Co-authored-by: Joseph Huber <huberjn at outlook.com>
---
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 41542ea1123c29..d88ba8a47d2708 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -708,9 +708,8 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
 
   uint32_t PreferredNumBlocks = TripCountNumBlocks;
   // If the loops are long running we rather reuse blocks than spawn too many.
-  if (GenericDevice.getReuseBlocksForHighTripCount()) {
+  if (GenericDevice.getReuseBlocksForHighTripCount())
     PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
-  }
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
 

>From 33faa5343c00548207a4a70655ddc57619450923 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Fri, 19 Apr 2024 14:10:05 +0200
Subject: [PATCH 3/7] remove _ and virtual

---
 offload/plugins-nextgen/common/include/PluginInterface.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 936e090dc07a16..1b7a0ca2136e3d 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -830,9 +830,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   }
 
   /// Whether or not to reuse blocks for high trip count loops.
-  /// @see OMPX__ReuseBlocksForHighTripCount
-  virtual bool getReuseBlocksForHighTripCount() {
-    return OMPX__ReuseBlocksForHighTripCount;
+  /// @see OMPX_ReuseBlocksForHighTripCount
+  bool getReuseBlocksForHighTripCount() {
+    return OMPX_ReuseBlocksForHighTripCount;
   }
 
   /// Get the total amount of hardware parallelism supported by the target
@@ -910,7 +910,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   UInt32Envar OMPX_MinThreadsForLowTripCount =
       UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
 
-  BoolEnvar OMPX__ReuseBlocksForHighTripCount =
+  BoolEnvar OMPX_ReuseBlocksForHighTripCount =
       BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
 
 protected:

>From 6ff7d746b2ce9224d59c115e6c097c8b4d0dae4d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Fri, 19 Apr 2024 14:29:21 +0200
Subject: [PATCH 4/7] add test

---
 .../high_trip_count_block_limit.cpp           | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 offload/test/offloading/high_trip_count_block_limit.cpp

diff --git a/offload/test/offloading/high_trip_count_block_limit.cpp b/offload/test/offloading/high_trip_count_block_limit.cpp
new file mode 100644
index 00000000000000..1972188c93e92a
--- /dev/null
+++ b/offload/test/offloading/high_trip_count_block_limit.cpp
@@ -0,0 +1,29 @@
+// RUN: %libomptarget-compilexx-generic && env
+// LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic
+// 2>&1 | %fcheck-generic UNSUPPORTED: aarch64-unknown-linux-gnu UNSUPPORTED:
+// aarch64-unknown-linux-gnu-LTO UNSUPPORTED: x86_64-pc-linux-gnu UNSUPPORTED:
+// x86_64-pc-linux-gnu-LTO UNSUPPORTED: s390x-ibm-linux-gnu UNSUPPORTED:
+// s390x-ibm-linux-gnu-LTO
+
+/*
+  Check if there is a thread for each loop iteration
+*/
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int N = 819200;
+  int num_threads[N];
+
+#pragma omp target teams distribute parallel for
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads() * omp_get_num_teams();
+  }
+
+  // CHECK: PASS
+  if (num_threads[0] == N)
+    printf("PASS\n");
+  else
+    printf("FAIL: num_threads: %d\n != N: %d", num_threads[0], N);
+  return 0;
+}

>From 7545cf0aa4ceaaa17ef580a67432587a0ea77ce9 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Fri, 19 Apr 2024 16:37:15 +0000
Subject: [PATCH 5/7] add LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT to docs

---
 openmp/docs/design/Runtimes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index f8a8cb87e83e66..520620ddb78735 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -742,6 +742,7 @@ variables is defined below.
     * ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
+    * ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""

>From 90e183386d7afe1b3466457caa73c9cf387729dc Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Mon, 6 May 2024 16:09:52 +0200
Subject: [PATCH 6/7] disable clang format and add check for default behaviour

---
 .../high_trip_count_block_limit.cpp           | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/offload/test/offloading/high_trip_count_block_limit.cpp b/offload/test/offloading/high_trip_count_block_limit.cpp
index 1972188c93e92a..4f510f91709a21 100644
--- a/offload/test/offloading/high_trip_count_block_limit.cpp
+++ b/offload/test/offloading/high_trip_count_block_limit.cpp
@@ -1,9 +1,14 @@
-// RUN: %libomptarget-compilexx-generic && env
-// LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic
-// 2>&1 | %fcheck-generic UNSUPPORTED: aarch64-unknown-linux-gnu UNSUPPORTED:
-// aarch64-unknown-linux-gnu-LTO UNSUPPORTED: x86_64-pc-linux-gnu UNSUPPORTED:
-// x86_64-pc-linux-gnu-LTO UNSUPPORTED: s390x-ibm-linux-gnu UNSUPPORTED:
-// s390x-ibm-linux-gnu-LTO
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=False %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+//
+// UNSUPPORTED: aarch64-unknown-linux-gnu 
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO 
+// UNSUPPORTED: x86_64-pc-linux-gnu 
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO 
+// UNSUPPORTED: s390x-ibm-linux-gnu 
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// clang-format on
 
 /*
   Check if there is a thread for each loop iteration
@@ -20,10 +25,11 @@ int main() {
     num_threads[j] = omp_get_num_threads() * omp_get_num_teams();
   }
 
-  // CHECK: PASS
   if (num_threads[0] == N)
+    // CHECK: PASS
     printf("PASS\n");
   else
+    // DEFAULT: FAIL
     printf("FAIL: num_threads: %d\n != N: %d", num_threads[0], N);
   return 0;
 }

>From 15b8c665909a2b2dc5b707a6d8adabec544366ea Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Mon, 6 May 2024 16:16:14 +0200
Subject: [PATCH 7/7] Fix docs

---
 openmp/docs/design/Runtimes.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 520620ddb78735..98dd984fd4b0c5 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1163,6 +1163,12 @@ of threads possible times the number of teams (aka. blocks) the device prefers
 count to increase outer (team/block) parallelism. The thread count will never
 be reduced below the value passed for this environment variable though.
 
+LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT
+"""""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable can be used to control how the OpenMP runtime assigns
+blocks to loops with high trip counts. By default we reuse existing blocks
+rather than spawning new blocks.
 
 
 .. _libomptarget_plugin:



More information about the Openmp-commits mailing list