[llvm-branch-commits] [llvm] [offload] Fix teams/threads limits in record replay (PR #200639)

Kevin Sala Penades via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Jun 18 00:33:45 PDT 2026


https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/200639

>From 3f18e65491f78e289805f35ff32b5f03e4544538 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 30 May 2026 23:45:16 -0700
Subject: [PATCH 1/2] [offload] Fix teams/threads limits in record replay

---
 .../common/include/PluginInterface.h          |  3 ++
 .../common/src/RecordReplay.cpp               | 18 ++++++---
 .../record-replay-diff-teams-threads.cpp      | 37 ++++++++++++++++---
 .../record-replay-diff-threads.cpp            | 13 ++++---
 .../kernelreplay/llvm-omp-kernel-replay.cpp   | 17 ++++++++-
 5 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 6e208bbc7e056..617f17ca0165f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -449,6 +449,9 @@ struct GenericKernelTy {
   /// Get the size of the static per-block memory consumed by the kernel.
   uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
 
+  /// Get the maximum number of threads per block that this kernel may use.
+  uint32_t getMaxThreads() const { return MaxNumThreads; }
+
   /// Get the kernel image.
   DeviceImageTy &getImage() const {
     assert(ImagePtr && "Kernel is not initialized!");
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index 30539454120b8..69874a76aed16 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -270,18 +270,24 @@ Error NativeRecordReplayTy::recordDescImpl(
   JsonKernelInfo["VAllocAddr"] = (intptr_t)StartAddr;
   JsonKernelInfo["VAllocSize"] = TotalSize;
 
-  // Add minimum and maximum for allowed number of teams. If zero, it means
+  // Export minimum and maximum for allowed number of teams. If zero, it means
   // there was no restriction provided by the program.
+  uint32_t MinMaxBlocks = std::max(KernelArgs.UserNumBlocks[0], uint32_t(0));
   json::Array JsonTeamsLimits;
-  JsonTeamsLimits.push_back(KernelArgs.UserNumBlocks[0]);
-  JsonTeamsLimits.push_back(KernelArgs.UserNumBlocks[0]);
+  JsonTeamsLimits.push_back(MinMaxBlocks);
+  JsonTeamsLimits.push_back(MinMaxBlocks);
   JsonKernelInfo["TeamsLimits"] = json::Value(std::move(JsonTeamsLimits));
 
-  // Add minimum and maximum for allowed number of threads. If zero, it means
+  // Export minimum and maximum for allowed number of threads. If zero, it means
   // there was no restriction provided by the program.
+  uint32_t UserThreads = std::max(KernelArgs.UserThreadLimit[0], uint32_t(0));
+  uint32_t MaxThreads = UserThreads
+                            ? std::min(UserThreads, Kernel.getMaxThreads())
+                            : Kernel.getMaxThreads();
+  assert(MaxThreads >= 0 && "MaxThreads must be greater than zero.");
   json::Array JsonThreadsLimits;
-  JsonThreadsLimits.push_back(uint32_t(KernelArgs.UserThreadLimit[0] > 0));
-  JsonThreadsLimits.push_back(KernelArgs.UserThreadLimit[0]);
+  JsonThreadsLimits.push_back(1);
+  JsonThreadsLimits.push_back(MaxThreads);
   JsonKernelInfo["ThreadsLimits"] = json::Value(std::move(JsonThreadsLimits));
 
   json::Array JsonArgPtrs;
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
index 803b633315527..e02d6a4d79efa 100644
--- a/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
@@ -2,12 +2,29 @@
 // RUN: %libomptarget-compilexx-generic
 // RUN: rm -rf %t.testdir
 // RUN: mkdir -p %t.testdir
-// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=1 --num-threads=1 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=2 --num-threads=32 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=32 --num-threads=64 {}
-// clang-format on
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 2
+// RUN: ls -1 %t.testdir/*.json | count 2
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=1 --num-threads=1 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=2 --num-threads=32 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=32 --num-threads=64 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=129 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=127 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=1024 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-teams=2 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR2 %s
+
+// RUN: %libomptarget-compilexx-generic -mllvm -openmp-ir-builder-use-default-max-threads=0
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 2
+// RUN: ls -1 %t.testdir/*.json | count 2
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=129 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=1024 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=2048 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
 
 // REQUIRES: gpu
 
@@ -16,6 +33,10 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: intelgpu
 
+// REPLAY-ERROR1: [llvm-omp-kernel-replay] Error: number of threads ({{[0-9]+}}) is out of the allowed limits (min,max: 1,{{[0-9]+}})
+// REPLAY-ERROR2: [llvm-omp-kernel-replay] Error: number of teams (2) is out of the allowed limits (min,max: 1,1)
+// clang-format on
+
 #include <cstdint>
 #include <cstdio>
 
@@ -33,6 +54,10 @@ int main() {
     Data[I] = 10 + (uint64_t)I;
   }
 
+#pragma omp target
+  {
+  }
+
   uint64_t Sum = 0;
   for (size_t I = 0; I < Size; ++I) {
     Sum += Data[I];
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
index 9b65c38f98390..51333f1095124 100644
--- a/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
@@ -2,11 +2,14 @@
 // RUN: %libomptarget-compilexx-generic
 // RUN: rm -rf %t.testdir
 // RUN: mkdir -p %t.testdir
-// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=1 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=32 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=64 {}
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 1
+// RUN: ls -1 %t.testdir/*.json | count 1
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=1 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=32 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=64 %t.testdir/{}
 // clang-format on
 
 // REQUIRES: gpu
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 4335002fd8c77..44fea1d79cb8c 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -209,16 +209,29 @@ Error replayKernel() {
   if (Err)
     return Err;
 
+  // Check that a minimum and maximum have been exported.
   if (TeamsLimits.size() != 2 || ThreadsLimits.size() != 2)
     return createErr("TeamsLimits and ThreadsLimits must have a min and max");
 
+  // Check that the minimum and maximum are specified or both are zero.
+  if (bool(TeamsLimits[0]) != bool(TeamsLimits[1]))
+    return createErr("TeamsLimits min and max are inconsistent");
+  if (bool(ThreadsLimits[0]) != bool(ThreadsLimits[1]))
+    return createErr("ThreadsLimits min and max are inconsistent");
+
   // If the limits were specified, verify the selected values are valid.
   if (TeamsLimits[0] > 0 &&
       (NumTeams < TeamsLimits[0] || NumTeams > TeamsLimits[1]))
-    return createErr("number of teams is out of the allowed limits");
+    return createErr("number of teams (%" PRIu32
+                     ") is out of the allowed limits (min,max: %" PRIu32
+                     ",%" PRIu32 ")",
+                     NumTeams, TeamsLimits[0], TeamsLimits[1]);
   if (ThreadsLimits[0] > 0 &&
       (NumThreads < ThreadsLimits[0] || NumThreads > ThreadsLimits[1]))
-    return createErr("number of threads is out of the allowed limits");
+    return createErr("number of threads (%" PRIu32
+                     ") is out of the allowed limits (min,max: %" PRIu32
+                     ",%" PRIu32 ")",
+                     NumThreads, ThreadsLimits[0], ThreadsLimits[1]);
 
   // Retrieve the arguments of the kernel.
   SmallVector<void *> TgtArgs;

>From dc96a0a1c415ba3a84688f1cf404e7fe500b4b85 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 18 Jun 2026 00:31:41 -0700
Subject: [PATCH 2/2] [offload] Add flag to ignore limits in kernel replay

---
 offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 44fea1d79cb8c..353bd9d07696d 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -63,6 +63,11 @@ static cl::opt<uint32_t>
                    cl::desc("Set the number of replay repetitions."),
                    cl::init(1), cl::cat(ReplayOptions));
 
+static cl::opt<bool>
+    IgnoreLimitsOpt("ignore-limits",
+                    cl::desc("Ignore thread and team limits (unrecommended)."),
+                    cl::init(false), cl::cat(ReplayOptions));
+
 template <typename... ArgsTy>
 Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
   return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
@@ -220,13 +225,13 @@ Error replayKernel() {
     return createErr("ThreadsLimits min and max are inconsistent");
 
   // If the limits were specified, verify the selected values are valid.
-  if (TeamsLimits[0] > 0 &&
+  if (!IgnoreLimitsOpt && TeamsLimits[0] > 0 &&
       (NumTeams < TeamsLimits[0] || NumTeams > TeamsLimits[1]))
     return createErr("number of teams (%" PRIu32
                      ") is out of the allowed limits (min,max: %" PRIu32
                      ",%" PRIu32 ")",
                      NumTeams, TeamsLimits[0], TeamsLimits[1]);
-  if (ThreadsLimits[0] > 0 &&
+  if (!IgnoreLimitsOpt && ThreadsLimits[0] > 0 &&
       (NumThreads < ThreadsLimits[0] || NumThreads > ThreadsLimits[1]))
     return createErr("number of threads (%" PRIu32
                      ") is out of the allowed limits (min,max: %" PRIu32



More information about the llvm-branch-commits mailing list