[llvm] 7481b46 - [OpenMP] Use default grid value for static grid size

Johannes Doerfert via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 23 11:13:18 PDT 2023


Author: Johannes Doerfert
Date: 2023-08-23T11:12:03-07:00
New Revision: 7481b465ae3000589c1360c5c039e73b1d789783

URL: https://github.com/llvm/llvm-project/commit/7481b465ae3000589c1360c5c039e73b1d789783
DIFF: https://github.com/llvm/llvm-project/commit/7481b465ae3000589c1360c5c039e73b1d789783.diff

LOG: [OpenMP] Use default grid value for static grid size

If the user did not provide a clause that statically overrides the grid
size, we assume the default grid size as an upper bound and use it to
improve code generation through vendor-specific attributes.

Fixes: https://github.com/llvm/llvm-project/issues/64816

Differential Revision: https://reviews.llvm.org/D158382
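
To illustrate the effect, below is a minimal, self-contained sketch of the
new fallback logic. The helper name pickDefaultNumThreads and its signature
are hypothetical; the actual implementation is the getGridValue helper and
the NumThreads fallback in setOutlinedTargetRegionFunctionAttributes shown
in the diff below.

#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/TargetParser/Triple.h"

using namespace llvm;

// Hypothetical helper (illustration only): compute the thread bound that is
// recorded when the frontend passed NumThreads == -1, i.e. when no clause
// bounded the number of threads statically.
static int32_t pickDefaultNumThreads(Function *Kernel, int32_t NumThreads,
                                     bool IsGPU) {
  if (NumThreads != -1 || !IsGPU)
    return NumThreads; // Explicit bound, or host compilation: keep as-is.

  if (Kernel->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
    // Wave64 and Wave32 subtargets carry different default grid values.
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    const omp::GV &GV = Features.contains("+wavefrontsize64")
                            ? omp::getAMDGPUGridValues<64>()
                            : omp::getAMDGPUGridValues<32>();
    return GV.GV_Default_WG_Size;
  }
  if (Triple(Kernel->getParent()->getTargetTriple()).isNVPTX())
    return omp::NVPTXGridValues.GV_Default_WG_Size;

  // No known default for other architectures; leave the value untouched.
  return NumThreads;
}

The resulting bound then feeds the existing attribute emission, e.g.
amdgpu-flat-work-group-size for AMDGPU kernels, as the diff below shows.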

Added: 
    openmp/libomptarget/test/offloading/default_thread_limit.c

Modified: 
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cba1336165b579..f628080ceb86c4 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallingConv.h"
@@ -37,6 +38,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -4121,6 +4123,20 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) {
   Builder.CreateCall(Fn, {});
 }
 
+static const omp::GV &getGridValue(Function *Kernel) {
+  if (Kernel->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    StringRef Features =
+        Kernel->getFnAttribute("target-features").getValueAsString();
+    if (Features.count("+wavefrontsize64"))
+      return omp::getAMDGPUGridValues<64>();
+    return omp::getAMDGPUGridValues<32>();
+  }
+  if (Triple(Kernel->getParent()->getTargetTriple()).isNVPTX())
+    return omp::NVPTXGridValues;
+  llvm_unreachable("No grid value available for this architecture!");
+}
+
 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
     Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) {
   if (Config.isTargetDevice()) {
@@ -4135,6 +4151,9 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
   if (NumTeams > 0)
     OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams));
 
+  if (NumThreads == -1 && Config.isGPU())
+    NumThreads = getGridValue(OutlinedFn).GV_Default_WG_Size;
+
   if (NumThreads > 0) {
     if (OutlinedFn->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
       OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",

diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
new file mode 100644
index 00000000000000..73c7e08ccaed49
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -0,0 +1,103 @@
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+__attribute__((optnone)) int optnone() { return 1; }
+
+int main() {
+  int N = optnone() * 4098 * 32;
+
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target 
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
+#pragma omp target thread_limit(optnone() * 42)
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
+#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// FIXME: Use the attribute value to imply a thread_limit
+// DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
+#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target
+#pragma omp teams distribute parallel for num_threads(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
+#pragma omp target
+#pragma omp teams distribute parallel for num_threads(9)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target thread_limit(4)
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target
+#pragma omp teams distribute parallel for thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
+#pragma omp target teams distribute parallel for num_threads(9)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+}
+


More information about the llvm-commits mailing list