[llvm] [mlir] [MLIR][OpenMP] Add Taskloop Collapse Support (PR #175924)

Wed Jan 14 09:13:16 PST 2026

https://github.com/Stylie777 updated https://github.com/llvm/llvm-project/pull/175924

>From 28278648faa69515f058b0b735e9c1a5e31b66d7 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 14 Jan 2026 11:09:16 +0000
Subject: [PATCH 1/2] [MLIR][OpenMP] Add Taskloop Collapse Support

Following work completed in #174386 and #174623, this patch adds
support for the collapse clause to Taskloop. Collapse allows the user
to combine multiple nested loops into a single loop; for this to
work with Taskloop, some changes are needed to how we process
the loops and the tasks that run them.

This patch brings Taskloop support in MLIR and Flang up to the
level of OpenMP 4.5.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   3 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  66 ++++++-----
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  30 +++--
 .../LLVMIR/openmp-taskloop-collapse.mlir      | 110 ++++++++++++++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  14 ---
 5 files changed, 171 insertions(+), 52 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 037fcaa863fe7..e03a5a36e840d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1494,7 +1494,8 @@ class OpenMPIRBuilder {
       Value *LBVal, Value *UBVal, Value *StepVal, bool Untied = false,
       Value *IfCond = nullptr, Value *GrainSize = nullptr, bool NoGroup = false,
       int Sched = 0, Value *Final = nullptr, bool Mergeable = false,
-      Value *Priority = nullptr, TaskDupCallbackTy DupCB = nullptr,
+      Value *Priority = nullptr, int NumOfCollapseLoops = 0,
+      TaskDupCallbackTy DupCB = nullptr,
       Value *TaskContextStructPtrVal = nullptr);
 
   /// Generator for `#omp task`
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8d7a207a91f5a..23a0c22f06cf2 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2098,7 +2098,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
     Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
     Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
-    Value *Priority, TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
+    Value *Priority, int NumOfCollapseLoops, TaskDupCallbackTy DupCB,
+    Value *TaskContextStructPtrVal) {
 
   if (!updateToLocation(Loc))
     return InsertPointTy();
@@ -2176,8 +2177,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
   OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
                       TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
                       IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
-                      FakeStep, Final, Mergeable,
-                      Priority](Function &OutlinedFn) mutable {
+                      FakeStep, Final, Mergeable, Priority,
+                      NumOfCollapseLoops](Function &OutlinedFn) mutable {
     // Replace the Stale CI by appropriate RTL function call.
     assert(OutlinedFn.hasOneUse() &&
            "there must be a single user for the outlined function");
@@ -2360,29 +2361,42 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     Builder.SetInsertPoint(CLI->getBody(),
                            CLI->getBody()->getFirstInsertionPt());
 
-    // The canonical loop is generated with a fixed lower bound. We need to
-    // update the index calculation code to use the task's lower bound. The
-    // generated code looks like this:
-    // %omp_loop.iv = phi ...
-    // ...
-    // %tmp = mul [type] %omp_loop.iv, step
-    // %user_index = add [type] tmp, lb
-    // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
-    // the normalised induction variable:
-    // 1. This one: converting the normalised IV to the user IV
-    // 2. The increment (add)
-    // 3. The comparison against the trip count (icmp)
-    // (1) is the only use that is a mul followed by an add so this cannot match
-    // other IR.
-    assert(CLI->getIndVar()->getNumUses() == 3 &&
-           "Canonical loop should have exactly three uses of the ind var");
-    for (User *IVUser : CLI->getIndVar()->users()) {
-      if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
-        if (Mul->getOpcode() == Instruction::Mul) {
-          for (User *MulUser : Mul->users()) {
-            if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
-              if (Add->getOpcode() == Instruction::Add) {
-                Add->setOperand(1, CastedTaskLB);
+    if (NumOfCollapseLoops > 1) {
+      // When using the collapse clause, the bounds of the loop have to be
+      // adjusted to
+      Value *IVPlusTaskLB = Builder.CreateAdd(
+          CLI->getIndVar(),
+          Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
+      for (User *IVUser : CLI->getIndVar()->users()) {
+        if (IVUser == IVPlusTaskLB)
+          continue;
+        IVUser->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
+      }
+    } else {
+      // The canonical loop is generated with a fixed lower bound. We need to
+      // update the index calculation code to use the task's lower bound. The
+      // generated code looks like this:
+      // %omp_loop.iv = phi ...
+      // ...
+      // %tmp = mul [type] %omp_loop.iv, step
+      // %user_index = add [type] tmp, lb
+      // OpenMPIRBuilder constructs canonical loops to have exactly three uses
+      // of the normalised induction variable:
+      // 1. This one: converting the normalised IV to the user IV
+      // 2. The increment (add)
+      // 3. The comparison against the trip count (icmp)
+      // (1) is the only use that is a mul followed by an add so this cannot
+      // match other IR.
+      assert(CLI->getIndVar()->getNumUses() == 3 &&
+             "Canonical loop should have exactly three uses of the ind var");
+      for (User *IVUser : CLI->getIndVar()->users()) {
+        if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
+          if (Mul->getOpcode() == Instruction::Mul) {
+            for (User *MulUser : Mul->users()) {
+              if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
+                if (Add->getOpcode() == Instruction::Add) {
+                  Add->setOperand(1, CastedTaskLB);
+                }
               }
             }
           }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4e7942e382c8b..d6ba3622f4af4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -329,10 +329,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getBare())
       result = todo("ompx_bare");
   };
-  auto checkCollapse = [&todo](auto op, LogicalResult &result) {
-    if (op.getCollapseNumLoops() > 1)
-      result = todo("collapse");
-  };
   auto checkDepend = [&todo](auto op, LogicalResult &result) {
     if (!op.getDependVars().empty() || op.getDependKinds())
       result = todo("depend");
@@ -383,10 +379,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkAllocate(op, result);
         checkOrder(op, result);
       })
-      .Case([&](omp::LoopNestOp op) {
-        if (mlir::isa<omp::TaskloopOp>(op.getOperation()->getParentOp()))
-          checkCollapse(op, result);
-      })
       .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
       .Case([&](omp::SectionsOp op) {
         checkAllocate(op, result);
@@ -2797,6 +2789,22 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
     return loopInfo;
   };
 
+  llvm::Value *ubVal = builder.getInt32(1);
+  Operation::operand_range lowerBounds = loopOp.getLoopLowerBounds();
+  Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
+  if (loopOp.getCollapseNumLoops() > 1) {
+    for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
+      ubVal = builder.CreateMul(
+          ubVal,
+          builder.CreateSub(
+              moduleTranslation.lookupValue(upperBounds[i]),
+              builder.CreateSub(moduleTranslation.lookupValue(lowerBounds[i]),
+                                builder.getInt32(1))));
+    }
+  } else {
+    ubVal = moduleTranslation.lookupValue(upperBounds[0]);
+  }
+
   llvm::Value *ifCond = nullptr;
   llvm::Value *grainsize = nullptr;
   int sched = 0; // default
@@ -2830,14 +2838,14 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTaskloop(
           ompLoc, allocaIP, bodyCB, loopInfo,
-          moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]),
-          moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
+          moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]), ubVal,
           moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
           taskloopOp.getUntied(), ifCond, grainsize, taskloopOp.getNogroup(),
           sched, moduleTranslation.lookupValue(taskloopOp.getFinal()),
           taskloopOp.getMergeable(),
           moduleTranslation.lookupValue(taskloopOp.getPriority()),
-          taskDupOrNull, taskStructMgr.getStructPtr());
+          loopOp.getCollapseNumLoops(), taskDupOrNull,
+          taskStructMgr.getStructPtr());
 
   if (failed(handleError(afterIP, opInst)))
     return failure();
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
new file mode 100644
index 0000000000000..08729ec7fbd45
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-collapse.mlir
@@ -0,0 +1,110 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+omp.private {type = private} @_QFtestEi_private_i32 : i32
+
+omp.private {type = firstprivate} @_QFtestEa_firstprivate_i32 : i32 copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.load %arg0 : !llvm.ptr -> i32
+  llvm.store %0, %arg1 : i32, !llvm.ptr
+  omp.yield(%arg1 : !llvm.ptr)
+}
+
+
+llvm.func @_QPtest() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3) : i32 = (%c1_i32, %c1_i32) to (%c10_i32, %c5_i32) inclusive step (%c1_i32, %c1_i32) collapse(2) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 50, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 5
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 5
+// CHECK: %[[VAL_11:.*]] = mul i32 %[[VAL_10]], 1
+// CHECK: %[[VAL_12:.*]] = add i32 %[[VAL_11]], 1
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1
+
+// -----
+
+llvm.func @_QPtest2() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+  %2 = llvm.alloca %0 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %6 = llvm.mlir.constant(20 : i32) : i32
+  llvm.store %6, %3 : i32, !llvm.ptr
+  %c1_i32 = llvm.mlir.constant(1 :i32) : i32
+  %c2_i32 = llvm.mlir.constant(2 : i32) : i32
+  %c5_i32 = llvm.mlir.constant(5 : i32) : i32
+  %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+  omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+    omp.loop_nest (%arg2, %arg3, %arg4) : i32 = (%c1_i32, %c1_i32, %c2_i32) to (%c10_i32, %c5_i32, %c5_i32) inclusive step (%c1_i32, %c1_i32, %c1_i32) collapse(3) {
+      llvm.store %arg2, %arg1 : i32, !llvm.ptr
+      %10 = llvm.load %arg0 : !llvm.ptr -> i32
+      %11 = llvm.mlir.constant(1 : i32) : i32
+      %12 = llvm.add %10, %11 : i32
+      llvm.store %12, %arg0 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+// CHECK: %[[structArg:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[structArg]], i32 0, i32 1
+// CHECK: store i64 200, ptr %[[ub]], align 4
+
+// CHECK: %[[VAL_1:.*]] = load ptr, ptr %0, align 8
+// CHECK: %[[gep_task_lb:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: %[[task_lb:.*]] = load i64, ptr %[[gep_task_lb]], align 4
+// CHECK: %[[gep_task_ub:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_1]], i32 0, i32 1
+// CHECK: %[[task_ub:.*]] = load i64, ptr %gep_ub.val, align 4
+// CHECK: %[[VAL_3:.*]] = sub i64 %[[task_ub]], %[[task_lb]]
+// CHECK: %[[VAL_4:.*]] = sdiv i64 %[[VAL_3]], 1
+// CHECK: %[[trip_cnt:.*]] = add i64 %[[VAL_4]], 1
+// CHECK: %[[VAL_6:.*]] = trunc i64 %[[task_lb]] to i32
+
+// CHECK: %[[VAL_7:.*]] = sub i32 %[[VAL_6]], 1
+// CHECK: %[[VAL_8:.*]] = add i32 %omp_collapsed.iv, %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = urem i32 %omp_collapsed.iv, 4
+// CHECK: %[[VAL_10:.*]] = udiv i32 %[[VAL_8]], 4
+// CHECK: %[[VAL_11:.*]] = urem i32 %[[VAL_10]], 5
+// CHECK: %[[VAL_12:.*]] = udiv i32 %[[VAL_10]], 5
+// CHECK: %[[VAL_13:.*]] = mul i32 %[[VAL_12]], 1
+// CHECK: %[[VAL_14:.*]] = add i32 %[[VAL_13]], 1
+// CHECK: %[[VAL_15:.*]] = mul i32 %[[VAL_11]], 1
+// CHECK: %[[VAL_16:.*]] = add i32 %[[VAL_15]], 1
+// CHECK: %[[VAL_17:.*]] = mul i32 %[[VAL_9]], 1
+// CHECK: %[[VAL_18:.*]] = add i32 %[[VAL_17]], 2
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 9ad22a2a5e80b..a3024de6fa3b8 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -331,20 +331,6 @@ llvm.func @taskloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr)
   llvm.return
 }
 
-// -----
-
-llvm.func @taskloop_collapse(%lb : i32, %ub : i32, %step : i32, %lb1 : i32, %ub1 : i32, %step1 : i32) {
-  // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
-  omp.taskloop {
-    // expected-error at below {{not yet implemented: Unhandled clause collapse in omp.loop_nest operation}}
-    // expected-error at below {{LLVM Translation failed for operation: omp.loop_nest}}
-    omp.loop_nest (%iv, %iv1) : i32 = (%lb, %lb1) to (%ub, %ub1) inclusive step (%step, %step1) collapse(2) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
-
 // -----
  omp.declare_reduction @add_reduction_i32 : i32 init {
   ^bb0(%arg0: i32):

>From 14ee9f4493bc623cd6705b7827a5d14265ee16a6 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 14 Jan 2026 17:10:44 +0000
Subject: [PATCH 2/2] Respond to Review Comments

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h      |  5 ++++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp             |  4 ++--
 .../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp      | 11 +++++------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e03a5a36e840d..8ed4e0ba14502 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1482,6 +1482,9 @@ class OpenMPIRBuilder {
   /// \param Mergeable If the given task is `mergeable`
   /// \param Priority `priority-value' specifies the execution order of the
   ///                 tasks that is generated by the construct
+  /// \param NumOfCollapseLoops Defines the number of loops that are being
+  /// collapsed. The default value is 1, as that's the value when collapse is
+  /// not used.
   /// \param DupCB The callback to generate the duplication code. See
   /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
   /// \param TaskContextStructPtrVal If non-null, a pointer to  to be placed
@@ -1494,7 +1497,7 @@ class OpenMPIRBuilder {
       Value *LBVal, Value *UBVal, Value *StepVal, bool Untied = false,
       Value *IfCond = nullptr, Value *GrainSize = nullptr, bool NoGroup = false,
       int Sched = 0, Value *Final = nullptr, bool Mergeable = false,
-      Value *Priority = nullptr, int NumOfCollapseLoops = 0,
+      Value *Priority = nullptr, uint64_t NumOfCollapseLoops = 1,
       TaskDupCallbackTy DupCB = nullptr,
       Value *TaskContextStructPtrVal = nullptr);
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 23a0c22f06cf2..201a3f0169cc3 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2098,7 +2098,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
     llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
     Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
     Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
-    Value *Priority, int NumOfCollapseLoops, TaskDupCallbackTy DupCB,
+    Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
     Value *TaskContextStructPtrVal) {
 
   if (!updateToLocation(Loc))
@@ -2363,7 +2363,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
 
     if (NumOfCollapseLoops > 1) {
       // When using the collapse clause, the bounds of the loop have to be
-      // adjusted to
+      // adjusted to properly represent the iterator of the outer loop.
       Value *IVPlusTaskLB = Builder.CreateAdd(
           CLI->getIndVar(),
           Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index d6ba3622f4af4..1f2c7d1aa0881 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2794,12 +2794,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
   Operation::operand_range upperBounds = loopOp.getLoopUpperBounds();
   if (loopOp.getCollapseNumLoops() > 1) {
     for (uint64_t i = 0; i < loopOp.getCollapseNumLoops(); i++) {
-      ubVal = builder.CreateMul(
-          ubVal,
-          builder.CreateSub(
-              moduleTranslation.lookupValue(upperBounds[i]),
-              builder.CreateSub(moduleTranslation.lookupValue(lowerBounds[i]),
-                                builder.getInt32(1))));
+      llvm::Value *lowerBoundMinusOne = builder.CreateSub(
+          moduleTranslation.lookupValue(lowerBounds[i]), builder.getInt32(1));
+      llvm::Value *loopTripCount = builder.CreateSub(
+          moduleTranslation.lookupValue(upperBounds[i]), lowerBoundMinusOne);
+      ubVal = builder.CreateMul(ubVal, loopTripCount);
     }
   } else {
     ubVal = moduleTranslation.lookupValue(upperBounds[0]);