[Mlir-commits] [llvm] [mlir] [MLIR][OpenMP] Add Taskloop Collapse Support (PR #175924)

Tue Feb 3 07:10:51 PST 2026

https://github.com/Stylie777 updated https://github.com/llvm/llvm-project/pull/175924

>From 28278648faa69515f058b0b735e9c1a5e31b66d7 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 14 Jan 2026 11:09:16 +0000
Subject: [PATCH 1/9] [MLIR][OpenMP] Add Taskloop Collapse Support

Following the work completed in #174386 and #174623, this patch adds
support for the collapse clause to Taskloop. Collapse lets the user
combine multiple nested loops into a single loop. For this to work
with Taskloop, changes are needed to how the loops, and the tasks
that run them, are processed.

This patch brings Taskloop support in MLIR and Flang up to parity
with OpenMP 4.5.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   3 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  66 ++++++-----
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  30 +++--
 .../LLVMIR/openmp-taskloop-collapse.mlir      | 110 ++++++++++++++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  14 ---
 5 files changed, 171 insertions(+), 52 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 037fcaa863fe7..e03a5a36e840d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1494,7 +1494,8 @@ class OpenMPIRBuilder {
       Value *LBVal, Value *UBVal, Value *StepVal, bool Untied = false,
       Value *IfCond = nullptr, Value *GrainSize = nullptr, bool NoGroup = false,
       int Sched = 0, Value *Final = nullptr, bool Mergeable = false,
-      Value *Priority = nullptr, TaskDupCallbackTy DupCB = nullptr,
+      Value *Priority = nullptr, int NumOfCollapseLoops = 0,
+      TaskDupCallbackTy DupCB = nullptr,
       Value *TaskContextStructPtrVal = nullptr);
 
   /// Generator for `#omp task`
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8d7a207a91f5a..23a0c22f06cf2 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2098,7 +2098,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
     Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
     Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
-    Value *Priority, TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
+    Value *Priority, int NumOfCollapseLoops, TaskDupCallbackTy DupCB,
+    Value *TaskContextStructPtrVal) {
 
   if (!updateToLocation(Loc))
     return InsertPointTy();
@@ -2176,8 +2177,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
   OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
                       TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
                       IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
-                      FakeStep, Final, Mergeable,
-                      Priority](Function &OutlinedFn) mutable {
+                      FakeStep, Final, Mergeable, Priority,
+                      NumOfCollapseLoops](Function &OutlinedFn) mutable {
     // Replace the Stale CI by appropriate RTL function call.
     assert(OutlinedFn.hasOneUse() &&
            "there must be a single user for the outlined function");
@@ -2360,29 +2361,42 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     Builder.SetInsertPoint(CLI->getBody(),
                            CLI->getBody()->getFirstInsertionPt());
 
-    // The canonical loop is generated with a fixed lower bound. We need to
-    // update the index calculation code to use the task's lower bound. The
-    // generated code looks like this:
-    // %omp_loop.iv = phi ...
-    // ...
-    // %tmp = mul [type] %omp_loop.iv, step
-    // %user_index = add [type] tmp, lb
-    // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
-    // the normalised induction variable:
-    // 1. This one: converting the normalised IV to the user IV
-    // 2. The increment (add)
-    // 3. The comparison against the trip count (icmp)
-    // (1) is the only use that is a mul followed by an add so this cannot match
-    // other IR.
-    assert(CLI->getIndVar()->getNumUses() == 3 &&
-           "Canonical loop should have exactly three uses of the ind var");
-    for (User *IVUser : CLI->getIndVar()->users()) {
-      if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
-        if (Mul->getOpcode() == Instruction::Mul) {
-          for (User *MulUser : Mul->users()) {
-            if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
-              if (Add->getOpcode() == Instruction::Add) {
-                Add->setOperand(1, CastedTaskLB);
+    if (NumOfCollapseLoops > 1) {
+      // When using the collapse clause, the bounds of the loop have to be
+      // adjusted to
+      Value *IVPlusTaskLB = Builder.CreateAdd(
+          CLI->getIndVar(),
+          Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
+      for (User *IVUser : CLI->getIndVar()->users()) {
+        if (IVUser == IVPlusTaskLB)
+          continue;
+        IVUser->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+      }
+    } else {
+      // The canonical loop is generated with a fixed lower bound. We need to
+      // update the index calculation code to use the task's lower bound. The
+      // generated code looks like this:
+      // %omp_loop.iv = phi ...
+      // ...
+      // %tmp = mul [type] %omp_loop.iv, step
+      // %user_index = add [type] tmp, lb
+      // OpenMPIRBuilder constructs canonical loops to have exactly three uses
+      // of the normalised induction variable:
+      // 1. This one: converting the normalised IV to the user IV
+      // 2. The increment (add)
+      // 3. The comparison against the trip count (icmp)
+      // (1) is the only use that is a mul followed by an add so this cannot
+      // match other IR.
+      assert(CLI->getIndVar()->getNumUses() == 3 &&
+             "Canonical loop should have exactly three uses of the ind var");
+      for (User *IVUser : CLI->getIndVar()->users()) {
+        if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
+          if (Mul->getOpcode() == Instruction::Mul) {
+            for (User *MulUser : Mul->users()) {
+              if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
+                if (Add->getOpcode() == Instruction::Add) {
+                  Add->setOperand(1, CastedTaskLB);
+                }
               }
             }
           }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4e7942e382c8b..d6ba3622f4af4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -329,10 +329,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getBare())
       result = todo("ompx_bare");
   };
-  auto checkCollapse = [&todo](auto op, LogicalResult &result) {
-    if (op.getCollapseNumLoops() > 1)
-      result = todo("collapse");
-  };
   auto checkDepend = [&todo](auto op, LogicalResult &result) {
     if (!op.getDependVars().empty() || op.getDependKinds())
       result = todo("depend");
@@ -383,10 +379,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkAllocate(op, result);
         checkOrder(op, result);
       })
-      .Case([&](omp::LoopNestOp op) {
-        if (mlir::isa<omp::TaskloopOp>(op.getOperation()->getParentOp()))
-          checkCollapse(op, result);
-      })
       .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
       .Case([&](omp::SectionsOp op) {
         checkAllocate(op, result);
@@ -2797,6 +2789,22 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     return loopInfo;
   };
 
+  llvm::Value *ubVal = builder.getInt32(1);
+  Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
+  Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
+  if (loopOp.getCollapseNumLoops() > 1) {
+    for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
+      ubVal = builder.CreateMul(
+          ubVal,
+          builder.CreateSub(
+              moduleTranslation.lookupValue(upperBounds[i]),
+              builder.CreateSub(moduleTranslation.lookupValue(lowerBounds[i]),
+                                builder.getInt32(1))));
+    }
+  } else {
+    ubVal = moduleTranslation.lookupValue(upperBounds[0]);
+  }
+
   llvm::Value *ifCond = nullptr;
   llvm::Value *grainsize = nullptr;
   int sched = 0; // default
@@ -2830,14 +2838,14 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTaskloop(
           ompLoc, allocaIP, bodyCB, loopInfo,
-          moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]),
-          moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
+          moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]), ubVal,
           moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
           taskloopOp.getUntied(), ifCond, grainsize, taskloopOp.getNogroup(),
           sched, moduleTranslation.lookupValue(taskloopOp.getFinal()),
           taskloopOp.getMergeable(),
           moduleTranslation.lookupValue(taskloopOp.getPriority()),
-          taskDupOrNull, taskStructMgr.getStructPtr());
+          loopOp.getCollapseNumLoops(), taskDupOrNull,
+          taskStructMgr.getStructPtr());
 
   if (failed(handleError(afterIP, opInst)))
     return failure();
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
new file mode 100644
index 0000000000000..08729ec7fbd45
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -0,0 +1,110 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+omp.private {type = private} @_QFtestEi_private_i32 : i32
+
+omp.private {type = firstprivate} @_QFtestEa_firstprivate_i32 : i32 copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.load %arg0 : !llvm.ptr -> i32
+  llvm.store %0, %arg1 : i32, !llvm.ptr
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+
+llvm.func @_QPtest() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c1_i32, %c1_i32) to (%c10_i32, %c5_i32) inclusive step (%c1_i32, %c1_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 50, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 5
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 1
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 1
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1
+
+// -----
+
+llvm.func @_QPtest2() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c2_i32 = llvm.mlir.constant(2 : i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3, %arg4) : i32 = (%c1_i32, %c1_i32, %c2_i32) to (%c10_i32, %c5_i32, %c5_i32) inclusive step (%c1_i32, %c1_i32, %c1_i32) collapse(3) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 200, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 4
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 4
+// CHECK: %[[VAL_11:.*]] = urem i32 %[[VAL_10]], 5
+// CHECK: %[[VAL_12:.*]] = udiv i32 %[[VAL_10]], 5
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_12]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1
+// CHECK: %[[VAL_15:.*]] = mul i32 %[[VAL_11]], 1
+// CHECK: %[[VAL_16:.*]] = add i32 %[[VAL_15]], 1
+// CHECK: %[[VAL_17:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_18:.*]] = add i32 %[[VAL_17]], 2
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 9ad22a2a5e80b..a3024de6fa3b8 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -331,20 +331,6 @@ llvm.func @taskloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr)
   llvm.return
 }
 
-// -----
-
-llvm.func @taskloop_collapse(%lb : i32, %ub : i32, %step : i32, %lb1 : i32, %ub1 : i32, %step1 : i32) {
-  // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
-  omp.taskloop {
-    // expected-error at below {{not yet implemented: Unhandled clause collapse in omp.loop_nest operation}}
-    // expected-error at below {{LLVM Translation failed for operation: omp.loop_nest}}
-    omp.loop_nest (%iv, %iv1) : i32 = (%lb, %lb1) to (%ub, %ub1) inclusive step (%step, %step1) collapse(2) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
-
 // -----
  omp.declare_reduction @add_reduction_i32 : i32 init {
   ^bb0(%arg0: i32):

>From 14ee9f4493bc623cd6705b7827a5d14265ee16a6 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 14 Jan 2026 17:10:44 +0000
Subject: [PATCH 2/9] Respond to Review Comments

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h      |  5 ++++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp             |  4 ++--
 .../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp      | 11 +++++------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e03a5a36e840d..8ed4e0ba14502 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1482,6 +1482,9 @@ class OpenMPIRBuilder {
   /// \param Mergeable If the given task is `mergeable`
   /// \param Priority `priority-value' specifies the execution order of the
   ///                 tasks that is generated by the construct
+  /// \param NumOfCollapseLoops Defines the number of loops that are being
+  /// collapsed. The default value is 1, as thats the value when collapse is not
+  /// used.
   /// \param DupCB The callback to generate the duplication code. See
   /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
   /// \param TaskContextStructPtrVal If non-null, a pointer to  to be placed
@@ -1494,7 +1497,7 @@ class OpenMPIRBuilder {
       Value *LBVal, Value *UBVal, Value *StepVal, bool Untied = false,
       Value *IfCond = nullptr, Value *GrainSize = nullptr, bool NoGroup = false,
       int Sched = 0, Value *Final = nullptr, bool Mergeable = false,
-      Value *Priority = nullptr, int NumOfCollapseLoops = 0,
+      Value *Priority = nullptr, uint64_t NumOfCollapseLoops = 1,
       TaskDupCallbackTy DupCB = nullptr,
       Value *TaskContextStructPtrVal = nullptr);
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 23a0c22f06cf2..201a3f0169cc3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2098,7 +2098,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
     Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
     Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
-    Value *Priority, int NumOfCollapseLoops, TaskDupCallbackTy DupCB,
+    Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
     Value *TaskContextStructPtrVal) {
 
   if (!updateToLocation(Loc))
@@ -2363,7 +2363,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
 
     if (NumOfCollapseLoops > 1) {
       // When using the collapse clause, the bounds of the loop have to be
-      // adjusted to
+      // adjusted to properly represent the iterator of the outer loop.
       Value *IVPlusTaskLB = Builder.CreateAdd(
           CLI->getIndVar(),
           Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index d6ba3622f4af4..1f2c7d1aa0881 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2794,12 +2794,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
   if (loopOp.getCollapseNumLoops() > 1) {
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
-      ubVal = builder.CreateMul(
-          ubVal,
-          builder.CreateSub(
-              moduleTranslation.lookupValue(upperBounds[i]),
-              builder.CreateSub(moduleTranslation.lookupValue(lowerBounds[i]),
-                                builder.getInt32(1))));
+      llvm::Value *lowerBoundMinusOne = builder.CreateSub(
+          moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
+      llvm::Value *loopTripCount = builder.CreateSub(
+          moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
+      ubVal = builder.CreateMul(ubVal, loopTripCount);
     }
   } else {
     ubVal = moduleTranslation.lookupValue(upperBounds[0]);

>From 8ee3c635ef259dbb2287c7ad746840e5b2f01b77 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Thu, 15 Jan 2026 08:26:00 +0000
Subject: [PATCH 3/9] Add comment for upper bounds calculation

---
 .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp       | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 1f2c7d1aa0881..7f7bafb0f3930 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2793,6 +2793,10 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
   if (loopOp.getCollapseNumLoops() > 1) {
+    // In cases where Collapse is used with Taskloop, the upper bound of the
+    // iteration space needs to be recalculated to cater for the collapsed loop.
+    // The Collapsed Loop UpperBound is the product of all collapsed
+    // loop's tripcount.
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
       llvm::Value *lowerBoundMinusOne = builder.CreateSub(
           moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));

>From 2f9466d1a2f3b6089c9e0f1d715f444c5b4bf281 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Mon, 19 Jan 2026 09:43:23 +0000
Subject: [PATCH 4/9] adjust how lower bound is handled

The lower bound was not being handled correctly for loops that do not
start from zero, so its handling has been adjusted.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 19 ++++++-
 .../LLVMIR/openmp-taskloop-collapse.mlir      | 56 +++++++++++++++++++
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 7f7bafb0f3930..e1e5afecd9ea3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2792,19 +2792,35 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::Value *ubVal = builder.getInt32(1);
   Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
+  llvm::Value *lbVal = nullptr;
   if (loopOp.getCollapseNumLoops() > 1) {
     // In cases where Collapse is used with Taskloop, the upper bound of the
     // iteration space needs to be recalculated to cater for the collapsed loop.
     // The Collapsed Loop UpperBound is the product of all collapsed
     // loop's tripcount.
+    // The LowerBound for collapsed loops is always 1. When the loops are
+    // collapsed, it will reset the bounds and add processing in to ensure the
+    // index's are presented as expected. As this happens after creating
+    // Taskloop, these bounds need predicting. Example:
+    // !$omp taskloop collapse(2)
+    //   do i = 1, 10
+    //     do j = 1, 5
+    //       ..
+    //     end do
+    //   end do
+    // This loop above has a total of 50 iterations, so the lb will be 1, and
+    // the ub will be 50. collapseLoops then handles ensuring that i and j are
+    // properly presented when used in the loop.
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
       llvm::Value *lowerBoundMinusOne = builder.CreateSub(
           moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
       llvm::Value *loopTripCount = builder.CreateSub(
           moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
+      lbVal = builder.getInt32(1);
       ubVal = builder.CreateMul(ubVal, loopTripCount);
     }
   } else {
+    lbVal = moduleTranslation.lookupValue(lowerBounds[0]);
     ubVal = moduleTranslation.lookupValue(upperBounds[0]);
   }
 
@@ -2840,8 +2856,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTaskloop(
-          ompLoc, allocaIP, bodyCB, loopInfo,
-          moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]), ubVal,
+          ompLoc, allocaIP, bodyCB, loopInfo, lbVal, ubVal,
           moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
           taskloopOp.getUntied(), ifCond, grainsize, taskloopOp.getNogroup(),
           sched, moduleTranslation.lookupValue(taskloopOp.getFinal()),
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
index 08729ec7fbd45..9ddaff22a9f47 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -42,6 +42,7 @@ llvm.func @_QPtest() {
 // CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
 // CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
 // CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+
 // CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
 // CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
 // CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
@@ -91,6 +92,7 @@ llvm.func @_QPtest2() {
 // CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
 // CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
 // CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+
 // CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
 // CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
 // CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
@@ -108,3 +110,57 @@ llvm.func @_QPtest2() {
 // CHECK: %[[VAL_16:.*]] = add i32 %[[VAL_15]], 1
 // CHECK: %[[VAL_17:.*]] = mul i32 %[[VAL_9]], 1
 // CHECK: %[[VAL_18:.*]] = add i32 %[[VAL_17]], 2
+
+// -----
+
+llvm.func @_QPtest3() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c2_i32 = llvm.mlir.constant(2 : i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  %c20_i32 = llvm.mlir.constant(20 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c10_i32, %c1_i32) to (%c20_i32, %c5_i32) inclusive step (%c1_i32, %c1_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 55, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %[[gep_task_ub]], align 4
+
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_5:.*]] = trunc i64 %[[trip_cnt]] to i32
+// CHECK: %6 = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 5
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
+
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 1
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 10
+
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1

>From b038700f50c934e8cfb0421a2b2d4938780af9dd Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 21 Jan 2026 09:43:49 +0000
Subject: [PATCH 5/9] Incorporate step values into bounds prediction

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  18 ++-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  27 +++-
 .../LLVMIR/openmp-taskloop-collapse.mlir      | 118 +++++++++++++++++-
 3 files changed, 153 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 201a3f0169cc3..2cd008f724e7b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2367,10 +2367,20 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
       Value *IVPlusTaskLB = Builder.CreateAdd(
           CLI->getIndVar(),
           Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
-      for (User *IVUser : CLI->getIndVar()->users()) {
-        if (IVUser == IVPlusTaskLB)
-          continue;
-        IVUser->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+      for (auto IVUse = CLI->getIndVar()->uses().begin();
+           IVUse != CLI->getIndVar()->uses().end();) {
+        User *IVUser = IVUse->getUser();
+        // To ensure every Use is correctly captured, we want to iterate before
+        // replacing the uses of the loop index. If this is done after replacing
+        // the uses, then it is possible for uses to be missed, and values are
+        // not calculated correctly
+        IVUse++;
+        if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
+          if (Op->getOpcode() == Instruction::URem ||
+              Op->getOpcode() == Instruction::UDiv) {
+            IVUser->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+          }
+        }
       }
     } else {
       // The canonical loop is generated with a fixed lower bound. We need to
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e1e5afecd9ea3..155d96eda0ffc 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2789,10 +2789,12 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     return loopInfo;
   };
 
-  llvm::Value *ubVal = builder.getInt32(1);
   Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
+  Operation::operand_range steps = loopOp.getLoopSteps();
   llvm::Value *lbVal = nullptr;
+  llvm::Value *ubVal = builder.getInt32(1);
+  llvm::Value *stepVal = nullptr;
   if (loopOp.getCollapseNumLoops() > 1) {
     // In cases where Collapse is used with Taskloop, the upper bound of the
     // iteration space needs to be recalculated to cater for the collapsed loop.
@@ -2816,13 +2818,33 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
           moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
       llvm::Value *loopTripCount = builder.CreateSub(
           moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
+      // For loops that have a step value greater than 1, we need to adjust the
+      // trip count to ensure the correct number of iterations for the loop is
+      // captured.
+      llvm::Value *loopTripCountDivStep = builder.CreateUDiv(
+          loopTripCount, moduleTranslation.lookupValue(steps[i]));
+      llvm::Value *loopTripCountRem = builder.CreateURem(
+          loopTripCount, moduleTranslation.lookupValue(steps[i]));
+      llvm::Value *needsRoundUp = builder.CreateICmpNE(
+          loopTripCountRem,
+          builder.getIntN(loopTripCountRem->getType()->getIntegerBitWidth(),
+                          0));
+      loopTripCount =
+          builder.CreateAdd(loopTripCountDivStep,
+                            builder.CreateZExtOrTrunc(
+                                needsRoundUp, loopTripCountDivStep->getType()));
       lbVal = builder.getInt32(1);
       ubVal = builder.CreateMul(ubVal, loopTripCount);
+      stepVal = builder.getInt32(1);
     }
   } else {
     lbVal = moduleTranslation.lookupValue(lowerBounds[0]);
     ubVal = moduleTranslation.lookupValue(upperBounds[0]);
+    stepVal = moduleTranslation.lookupValue(steps[0]);
   }
+  assert(lbVal != nullptr && "Expected value for lbVal");
+  assert(ubVal != nullptr && "Expected value for ubVal");
+  assert(stepVal != nullptr && "Expected value for stepVal");
 
   llvm::Value *ifCond = nullptr;
   llvm::Value *grainsize = nullptr;
@@ -2856,8 +2878,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTaskloop(
-          ompLoc, allocaIP, bodyCB, loopInfo, lbVal, ubVal,
-          moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
+          ompLoc, allocaIP, bodyCB, loopInfo, lbVal, ubVal, stepVal,
           taskloopOp.getUntied(), ifCond, grainsize, taskloopOp.getNogroup(),
           sched, moduleTranslation.lookupValue(taskloopOp.getFinal()),
           taskloopOp.getMergeable(),
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
index 9ddaff22a9f47..5d871c28959a1 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -50,7 +50,7 @@ llvm.func @_QPtest() {
 
 // CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
 // CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
-// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 5
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 5
 // CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
 // CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 1
 // CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 1
@@ -100,7 +100,7 @@ llvm.func @_QPtest2() {
 
 // CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
 // CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
-// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 4
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 4
 // CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 4
 // CHECK: %[[VAL_11:.*]] = urem i32 %[[VAL_10]], 5
 // CHECK: %[[VAL_12:.*]] = udiv i32 %[[VAL_10]], 5
@@ -156,7 +156,7 @@ llvm.func @_QPtest3() {
 
 // CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
 // CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
-// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 5
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 5
 // CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
 
 // CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 1
@@ -164,3 +164,115 @@ llvm.func @_QPtest3() {
 
 // CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
 // CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1
+
+// -----
+
+llvm.func @_QPtest4() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c2_i32 = llvm.mlir.constant(2 : i32) : i32
+  %c3_i32 = llvm.mlir.constant(3 : i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  %c15_i32 = llvm.mlir.constant(15 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c2_i32, %c5_i32) to (%c10_i32, %c15_i32) inclusive step (%c2_i32, %c3_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 20, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %[[gep_task_ub]], align 4
+
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_5:.*]] = trunc i64 %[[trip_cnt]] to i32
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 4
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 4
+
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 2
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 2
+
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 3
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 5
+
+
+// -----
+
+llvm.func @_QPtest5() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %cneg2_i32 = llvm.mlir.constant(-2: i32) : i32
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c2_i32 = llvm.mlir.constant(2 : i32) : i32
+  %c3_i32 = llvm.mlir.constant(3 : i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  %c15_i32 = llvm.mlir.constant(15 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%cneg2_i32, %c5_i32) to (%c10_i32, %c15_i32) inclusive step (%c2_i32, %c3_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 28, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %[[gep_task_ub]], align 4
+
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_5:.*]] = trunc i64 %[[trip_cnt]] to i32
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 4
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 4
+
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 2
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], -2
+
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 3
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 5

>From cbe03def9c35fa951ae3b8fe0f8d68116d843f3f Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Tue, 27 Jan 2026 16:32:29 +0000
Subject: [PATCH 6/9] Respond to review comments

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 15 +++---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  2 +-
 .../LLVMIR/openmp-taskloop-collapse.mlir      | 54 +++++++++++++++++++
 3 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2cd008f724e7b..af782c87f0b44 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2362,26 +2362,27 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
                            CLI->getBody()->getFirstInsertionPt());
 
     if (NumOfCollapseLoops > 1) {
+      std::vector<User *> UsersToReplace;
       // When using the collapse clause, the bounds of the loop have to be
       // adjusted to properly represent the iterator of the outer loop.
       Value *IVPlusTaskLB = Builder.CreateAdd(
           CLI->getIndVar(),
           Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
+      // To ensure every Use is correctly captured, we first want to record which
+      // users to replace the value in, and then replace the value.
       for (auto IVUse = CLI->getIndVar()->uses().begin();
-           IVUse != CLI->getIndVar()->uses().end();) {
+           IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
         User *IVUser = IVUse->getUser();
-        // To ensure every Use is correctly captured, we want to iterate before
-        // replacing the uses of the loop index. If this is done after replacing
-        // the uses, then it is possible for uses to be missed, and values are
-        // not calculated correctly
-        IVUse++;
         if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
           if (Op->getOpcode() == Instruction::URem ||
               Op->getOpcode() == Instruction::UDiv) {
-            IVUser->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+            UsersToReplace.push_back(IVUser);
           }
         }
       }
+      for (User *User : UsersToReplace) {
+        User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+      }
     } else {
       // The canonical loop is generated with a fixed lower bound. We need to
       // update the index calculation code to use the task's lower bound. The
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 155d96eda0ffc..375ee5c8ab4e6 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2818,7 +2818,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
           moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
       llvm::Value *loopTripCount = builder.CreateSub(
           moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
-      // For loops that have a step value greater than 1, we need to adjust the
+      // For loops that have a step value not equal to 1, we need to adjust the
       // trip count to ensure the correct number of iterations for the loop is
       // captured.
       llvm::Value *loopTripCountDivStep = builder.CreateUDiv(
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
index 5d871c28959a1..ad10bf7cb471c 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -276,3 +276,57 @@ llvm.func @_QPtest5() {
 
 // CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 3
 // CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 5
+
+// -----
+
+llvm.func @_QPtest6() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %cneg1_i32 = llvm.mlir.constant(-1: i32) : i32
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  %c15_i32 = llvm.mlir.constant(15 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c5_i32, %c1_i32) to (%c10_i32, %c15_i32) inclusive step (%cneg1_i32, %c1_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 30, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %[[gep_task_ub]], align 4
+
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_5:.*]] = trunc i64 %[[trip_cnt]] to i32
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 15
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 15
+
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], -1
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 5
+
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1

>From 6195ceedcfc5da970c776c558cc6d5d01fefbeee Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Thu, 29 Jan 2026 09:53:48 +0000
Subject: [PATCH 7/9] Adjust loop bounds predictions to better handle negative
 step values

---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 37 ++++++++++++-------
 .../LLVMIR/openmp-taskloop-collapse.mlir      |  9 ++---
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 375ee5c8ab4e6..a3dbf7e82a204 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2792,8 +2792,9 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
   Operation::operand_range steps = loopOp.getLoopSteps();
+  llvm::Type * boundType = moduleTranslation.lookupValue(lowerBounds[0])->getType();
   llvm::Value *lbVal = nullptr;
-  llvm::Value *ubVal = builder.getInt32(1);
+  llvm::Value *ubVal = builder.getIntN(boundType->getIntegerBitWidth(), 1);
   llvm::Value *stepVal = nullptr;
   if (loopOp.getCollapseNumLoops() > 1) {
     // In cases where Collapse is used with Taskloop, the upper bound of the
@@ -2801,7 +2802,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     // The Collapsed Loop UpperBound is the product of all collapsed
     // loop's tripcount.
     // The LowerBound for collapsed loops is always 1. When the loops are
-    // collapsed, it will reset the bounds and add processing in to ensure the
+    // collapsed, it will reset the bounds and introduce processing to ensure the
     // index's are presented as expected. As this happens after creating
     // Taskloop, these bounds need predicting. Example:
     // !$omp taskloop collapse(2)
@@ -2811,32 +2812,40 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     //     end do
     //   end do
     // This loop above has a total of 50 iterations, so the lb will be 1, and
-    // the ub will be 50. collapseLoops then handles ensuring that i and j are
+    // the ub will be 50. collapseLoops in OMPIRBuilder then handles ensuring that i and j are
     // properly presented when used in the loop.
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
-      llvm::Value *lowerBoundMinusOne = builder.CreateSub(
-          moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
-      llvm::Value *loopTripCount = builder.CreateSub(
-          moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
+      llvm::Value *loopLb = moduleTranslation.lookupValue(lowerBounds[i]);
+      llvm::Value *loopUb = moduleTranslation.lookupValue(upperBounds[i]);
+      llvm::Value *loopStep = moduleTranslation.lookupValue(steps[i]);
+      // In some cases, such as where the ub is less than the lb so the loop steps down, the calculation for the loopTripCount is swapped.
+      // To ensure the correct value is found, calculate both UB - LB and LB - UB then select which value to use depending on how the loop has been configured.
+      llvm::Value *loopLbMinusOne = builder.CreateSub(loopLb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
+      llvm::Value *loopUbMinusOne = builder.CreateSub(loopUb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
+      llvm::Value *boundsCmp = builder.CreateICmpSLT(loopLb, loopUb);
+      llvm::Value *ubMinusLb = builder.CreateSub(loopUb, loopLbMinusOne);
+      llvm::Value *lbMinusUb = builder.CreateSub(loopLb, loopUbMinusOne);
+      llvm::Value *loopTripCount = builder.CreateSelect(boundsCmp, ubMinusLb, lbMinusUb);
+      loopTripCount = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCount, builder.getFalse());
       // For loops that have a step value not equal to 1, we need to adjust the
       // trip count to ensure the correct number of iterations for the loop is
       // captured.
-      llvm::Value *loopTripCountDivStep = builder.CreateUDiv(
-          loopTripCount, moduleTranslation.lookupValue(steps[i]));
-      llvm::Value *loopTripCountRem = builder.CreateURem(
-          loopTripCount, moduleTranslation.lookupValue(steps[i]));
+      llvm::Value *loopTripCountDivStep = builder.CreateSDiv( loopTripCount, loopStep); 
+      loopTripCountDivStep = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCountDivStep, builder.getFalse());
+      llvm::Value *loopTripCountRem = builder.CreateSRem(loopTripCount, loopStep);
+      loopTripCountRem = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCountRem, builder.getFalse());
       llvm::Value *needsRoundUp = builder.CreateICmpNE(
           loopTripCountRem,
           builder.getIntN(loopTripCountRem->getType()->getIntegerBitWidth(),
-                          0));
+                          0)); 
       loopTripCount =
           builder.CreateAdd(loopTripCountDivStep,
                             builder.CreateZExtOrTrunc(
                                 needsRoundUp, loopTripCountDivStep->getType()));
-      lbVal = builder.getInt32(1);
       ubVal = builder.CreateMul(ubVal, loopTripCount);
-      stepVal = builder.getInt32(1);
     }
+    lbVal = builder.getIntN(boundType->getIntegerBitWidth(), 1);
+    stepVal = builder.getIntN(boundType->getIntegerBitWidth(), 1);
   } else {
     lbVal = moduleTranslation.lookupValue(lowerBounds[0]);
     ubVal = moduleTranslation.lookupValue(upperBounds[0]);
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
index ad10bf7cb471c..f0abff7e38869 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -290,9 +290,8 @@ llvm.func @_QPtest6() {
   %c1_i32 = llvm.mlir.constant(1 :i32) : i32
   %c5_i32 = llvm.mlir.constant(5 : i32) : i32
   %c10_i32 = llvm.mlir.constant(10 : i32) : i32
-  %c15_i32 = llvm.mlir.constant(15 : i32) : i32
   omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
-    omp.loop_nest (%arg2, %arg3) : i32 = (%c5_i32, %c1_i32) to (%c10_i32, %c15_i32) inclusive step (%cneg1_i32, %c1_i32) collapse(2) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c10_i32, %c1_i32) to (%c5_i32, %c5_i32) inclusive step (%cneg1_i32, %c1_i32) collapse(2) {
       llvm.store %arg2, %arg1 : i32, !llvm.ptr
       %10 = llvm.load %arg0 : !llvm.ptr -> i32
       %11 = llvm.mlir.constant(1 : i32) : i32
@@ -322,11 +321,11 @@ llvm.func @_QPtest6() {
 
 // CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
 // CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
-// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 15
-// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 15
+// CHECK: %[[VAL_9:.*]] = urem i32 %[[VAL_8]], 5
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
 
 // CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], -1
-// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 5
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 10
 
 // CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
 // CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1

>From 7274ce3ef330df01ab889fff14fcf9427a2e04bc Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Thu, 29 Jan 2026 09:55:19 +0000
Subject: [PATCH 8/9] formatting

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  4 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 44 ++++++++++++-------
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index af782c87f0b44..e491a7066fa06 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2368,8 +2368,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
       Value *IVPlusTaskLB = Builder.CreateAdd(
           CLI->getIndVar(),
           Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
-      // To ensure every Use is correctly captured, we first want to record which
-      // users to replace the value in, and then replace the value.
+      // Collect the users first and rewrite them afterwards: replacing uses
+      // while walking the use list can cause some uses to be skipped.
       for (auto IVUse = CLI->getIndVar()->uses().begin();
            IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
         User *IVUser = IVUse->getUser();
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index a3dbf7e82a204..1671c601698b4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2792,7 +2792,8 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
   Operation::operand_range steps = loopOp.getLoopSteps();
-  llvm::Type * boundType = moduleTranslation.lookupValue(lowerBounds[0])->getType();
+  llvm::Type *boundType =
+      moduleTranslation.lookupValue(lowerBounds[0])->getType();
   llvm::Value *lbVal = nullptr;
   llvm::Value *ubVal = builder.getIntN(boundType->getIntegerBitWidth(), 1);
   llvm::Value *stepVal = nullptr;
@@ -2802,8 +2803,8 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     // The Collapsed Loop UpperBound is the product of all collapsed
     // loop's tripcount.
     // The LowerBound for collapsed loops is always 1. When the loops are
-    // collapsed, it will reset the bounds and introduce processing to ensure the
-    // index's are presented as expected. As this happens after creating
+    // collapsed, it will reset the bounds and introduce processing to ensure
+    // the indices are presented as expected. As this happens after creating
     // Taskloop, these bounds need predicting. Example:
     // !$omp taskloop collapse(2)
     //   do i = 1, 10
@@ -2812,32 +2813,43 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     //     end do
     //   end do
     // This loop above has a total of 50 iterations, so the lb will be 1, and
-    // the ub will be 50. collapseLoops in OMPIRBuilder then handles ensuring that i and j are
-    // properly presented when used in the loop.
+    // the ub will be 50. collapseLoops in OMPIRBuilder then handles ensuring
+    // that i and j are properly presented when used in the loop.
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
       llvm::Value *loopLb = moduleTranslation.lookupValue(lowerBounds[i]);
       llvm::Value *loopUb = moduleTranslation.lookupValue(upperBounds[i]);
       llvm::Value *loopStep = moduleTranslation.lookupValue(steps[i]);
-      // In some cases, such as where the ub is less than the lb so the loop steps down, the calculation for the loopTripCount is swapped.
-      // To ensure the correct value is found, calculate both UB - LB and LB - UB then select which value to use depending on how the loop has been configured.
-      llvm::Value *loopLbMinusOne = builder.CreateSub(loopLb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
-      llvm::Value *loopUbMinusOne = builder.CreateSub(loopUb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
+      // A loop may count down, i.e. its ub is less than its lb. In that case
+      // the span of the loop is LB - UB rather than UB - LB. To get the
+      // correct value either way, compute both differences (each made
+      // inclusive by adding one) and select between them based on how the
+      // bounds compare.
+      llvm::Value *loopLbMinusOne = builder.CreateSub(
+          loopLb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
+      llvm::Value *loopUbMinusOne = builder.CreateSub(
+          loopUb, builder.getIntN(boundType->getIntegerBitWidth(), 1));
       llvm::Value *boundsCmp = builder.CreateICmpSLT(loopLb, loopUb);
       llvm::Value *ubMinusLb = builder.CreateSub(loopUb, loopLbMinusOne);
       llvm::Value *lbMinusUb = builder.CreateSub(loopLb, loopUbMinusOne);
-      llvm::Value *loopTripCount = builder.CreateSelect(boundsCmp, ubMinusLb, lbMinusUb);
-      loopTripCount = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCount, builder.getFalse());
+      llvm::Value *loopTripCount =
+          builder.CreateSelect(boundsCmp, ubMinusLb, lbMinusUb);
+      loopTripCount = builder.CreateBinaryIntrinsic(
+          llvm::Intrinsic::abs, loopTripCount, builder.getFalse());
       // For loops that have a step value not equal to 1, we need to adjust the
       // trip count to ensure the correct number of iterations for the loop is
       // captured.
-      llvm::Value *loopTripCountDivStep = builder.CreateSDiv( loopTripCount, loopStep); 
-      loopTripCountDivStep = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCountDivStep, builder.getFalse());
-      llvm::Value *loopTripCountRem = builder.CreateSRem(loopTripCount, loopStep);
-      loopTripCountRem = builder.CreateBinaryIntrinsic(llvm::Intrinsic::abs, loopTripCountRem, builder.getFalse());
+      llvm::Value *loopTripCountDivStep =
+          builder.CreateSDiv(loopTripCount, loopStep);
+      loopTripCountDivStep = builder.CreateBinaryIntrinsic(
+          llvm::Intrinsic::abs, loopTripCountDivStep, builder.getFalse());
+      llvm::Value *loopTripCountRem =
+          builder.CreateSRem(loopTripCount, loopStep);
+      loopTripCountRem = builder.CreateBinaryIntrinsic(
+          llvm::Intrinsic::abs, loopTripCountRem, builder.getFalse());
       llvm::Value *needsRoundUp = builder.CreateICmpNE(
           loopTripCountRem,
           builder.getIntN(loopTripCountRem->getType()->getIntegerBitWidth(),
-                          0)); 
+                          0));
       loopTripCount =
           builder.CreateAdd(loopTripCountDivStep,
                             builder.CreateZExtOrTrunc(

>From 75c6c443bcc6fd8ae686c06d0daa4d14596ff423 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Tue, 3 Feb 2026 15:09:30 +0000
Subject: [PATCH 9/9] Change to using llvm::SmallVector

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e491a7066fa06..b607e813c167f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2362,7 +2362,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
                            CLI->getBody()->getFirstInsertionPt());
 
     if (NumOfCollapseLoops > 1) {
-      std::vector<User *> UsersToReplace;
+      llvm::SmallVector<User *> UsersToReplace;
       // When using the collapse clause, the bounds of the loop have to be
       // adjusted to properly represent the iterator of the outer loop.
       Value *IVPlusTaskLB = Builder.CreateAdd(