[llvm-branch-commits] [mlir] [MLIR][OpenMP] Update omp.wsloop translation to LLVM IR (4/5) (PR #89214)

Sergio Afonso via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Apr 18 03:54:10 PDT 2024


https://github.com/skatrak created https://github.com/llvm/llvm-project/pull/89214

This patch introduces minimal changes to the MLIR to LLVM IR translation of omp.wsloop to support the loop wrapper approach.

There is `omp.loop_nest`-related translation code that should be extracted and shared among all loop operations (e.g. `omp.simd`). Doing so would likely also help when adding support for compound constructs later on. This first approach is only intended to keep things running after the transition to loop wrappers, not to add support for other use cases enabled by that transition.

This PR on its own will not pass premerge tests. All patches in the stack are needed before it compiles and passes tests.
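For reference, a minimal before/after sketch of the wrapper form, mirroring the test updates below (the `schedule` clause and value names are illustrative):

```mlir
// Before: loop bounds and induction variable attached directly to omp.wsloop.
omp.wsloop schedule(static)
for (%iv) : i32 = (%lb) to (%ub) step (%step) {
  llvm.call @body(%iv) : (i32) -> ()
  omp.yield
}

// After: omp.wsloop is a wrapper whose region holds an omp.loop_nest carrying
// the bounds, followed by an omp.terminator.
omp.wsloop schedule(static) {
  omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
    llvm.call @body(%iv) : (i32) -> ()
    omp.yield
  }
  omp.terminator
}
```

On the translation side, `convertOmpWsloop` now reads clauses (schedule, ordered, nowait, reductions) from the `omp.wsloop` wrapper and loop bounds, steps, and induction variables from the nested `omp.loop_nest`, as shown in the patch below.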

>From 25dc3a45645ab2310606b2ab02346eed700c7f97 Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof at amd.com>
Date: Thu, 18 Apr 2024 11:07:10 +0100
Subject: [PATCH] [MLIR][OpenMP] Update omp.wsloop translation to LLVM IR (4/5)

This patch introduces minimal changes to the MLIR to LLVM IR translation of
omp.wsloop to support the loop wrapper approach.

There is `omp.loop_nest`-related translation code that should be extracted and
shared among all loop operations (e.g. `omp.simd`). Doing so would likely also
help when adding support for compound constructs later on. This first approach
is only intended to keep things running after the transition to loop wrappers,
not to add support for other use cases enabled by that transition.

This PR on its own will not pass premerge tests. All patches in the stack are
needed before it compiles and passes tests.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  68 +-
 .../LLVMIR/omptarget-parallel-wsloop.mlir     |  11 +-
 .../LLVMIR/omptarget-wsloop-collapsed.mlir    |  17 +-
 mlir/test/Target/LLVMIR/omptarget-wsloop.mlir |  18 +-
 .../LLVMIR/openmp-data-target-device.mlir     |  31 +-
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      | 741 ++++++++++--------
 mlir/test/Target/LLVMIR/openmp-nested.mlir    |  30 +-
 mlir/test/Target/LLVMIR/openmp-reduction.mlir | 113 +--
 .../openmp-wsloop-reduction-cleanup.mlir      |   9 +-
 9 files changed, 572 insertions(+), 466 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e89ff9209b034a..22d6462b881dc0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -916,35 +916,37 @@ static LogicalResult inlineReductionCleanup(
 static LogicalResult
 convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
-  auto loop = cast<omp::WsloopOp>(opInst);
-  const bool isByRef = loop.getByref();
+  auto wsloopOp = cast<omp::WsloopOp>(opInst);
+  auto loopOp = cast<omp::LoopNestOp>(wsloopOp.getWrappedLoop());
+  const bool isByRef = wsloopOp.getByref();
+
   // TODO: this should be in the op verifier instead.
-  if (loop.getLowerBound().empty())
+  if (loopOp.getLowerBound().empty())
     return failure();
 
   // Static is the default.
   auto schedule =
-      loop.getScheduleVal().value_or(omp::ClauseScheduleKind::Static);
+      wsloopOp.getScheduleVal().value_or(omp::ClauseScheduleKind::Static);
 
   // Find the loop configuration.
-  llvm::Value *step = moduleTranslation.lookupValue(loop.getStep()[0]);
+  llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[0]);
   llvm::Type *ivType = step->getType();
   llvm::Value *chunk = nullptr;
-  if (loop.getScheduleChunkVar()) {
+  if (wsloopOp.getScheduleChunkVar()) {
     llvm::Value *chunkVar =
-        moduleTranslation.lookupValue(loop.getScheduleChunkVar());
+        moduleTranslation.lookupValue(wsloopOp.getScheduleChunkVar());
     chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
   }
 
   SmallVector<omp::DeclareReductionOp> reductionDecls;
-  collectReductionDecls(loop, reductionDecls);
+  collectReductionDecls(wsloopOp, reductionDecls);
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
 
   SmallVector<llvm::Value *> privateReductionVariables;
   DenseMap<Value, llvm::Value *> reductionVariableMap;
   if (!isByRef) {
-    allocByValReductionVars(loop, builder, moduleTranslation, allocaIP,
+    allocByValReductionVars(wsloopOp, builder, moduleTranslation, allocaIP,
                             reductionDecls, privateReductionVariables,
                             reductionVariableMap);
   }
@@ -952,13 +954,12 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
-  MutableArrayRef<BlockArgument> reductionArgs =
-      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
-  for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
+  ArrayRef<BlockArgument> reductionArgs = wsloopOp.getRegion().getArguments();
+  for (unsigned i = 0; i < wsloopOp.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
 
     // map block argument to initializer region
-    mapInitializationArg(loop, moduleTranslation, reductionDecls, i);
+    mapInitializationArg(wsloopOp, moduleTranslation, reductionDecls, i);
 
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
                                        "omp.reduction.neutral", builder,
@@ -977,7 +978,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
 
       privateReductionVariables.push_back(var);
       moduleTranslation.mapValue(reductionArgs[i], phis[0]);
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+      reductionVariableMap.try_emplace(wsloopOp.getReductionVars()[i], phis[0]);
     } else {
       // for by-ref case the store is inside of the reduction region
       builder.CreateStore(phis[0], privateReductionVariables[i]);
@@ -1008,33 +1009,34 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   auto bodyGen = [&](llvm::OpenMPIRBuilder::InsertPointTy ip, llvm::Value *iv) {
     // Make sure further conversions know about the induction variable.
     moduleTranslation.mapValue(
-        loop.getRegion().front().getArgument(loopInfos.size()), iv);
+        loopOp.getRegion().front().getArgument(loopInfos.size()), iv);
 
     // Capture the body insertion point for use in nested loops. BodyIP of the
     // CanonicalLoopInfo always points to the beginning of the entry block of
     // the body.
     bodyInsertPoints.push_back(ip);
 
-    if (loopInfos.size() != loop.getNumLoops() - 1)
+    if (loopInfos.size() != loopOp.getNumLoops() - 1)
       return;
 
     // Convert the body of the loop.
     builder.restoreIP(ip);
-    convertOmpOpRegions(loop.getRegion(), "omp.wsloop.region", builder,
+    convertOmpOpRegions(loopOp.getRegion(), "omp.wsloop.region", builder,
                         moduleTranslation, bodyGenStatus);
   };
 
   // Delegate actual loop construction to the OpenMP IRBuilder.
-  // TODO: this currently assumes Wsloop is semantically similar to SCF loop,
-  // i.e. it has a positive step, uses signed integer semantics. Reconsider
-  // this code when Wsloop clearly supports more cases.
+  // TODO: this currently assumes omp.loop_nest is semantically similar to SCF
+  // loop, i.e. it has a positive step, uses signed integer semantics.
+  // Reconsider this code when the nested loop operation clearly supports more
+  // cases.
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
-  for (unsigned i = 0, e = loop.getNumLoops(); i < e; ++i) {
+  for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) {
     llvm::Value *lowerBound =
-        moduleTranslation.lookupValue(loop.getLowerBound()[i]);
+        moduleTranslation.lookupValue(loopOp.getLowerBound()[i]);
     llvm::Value *upperBound =
-        moduleTranslation.lookupValue(loop.getUpperBound()[i]);
-    llvm::Value *step = moduleTranslation.lookupValue(loop.getStep()[i]);
+        moduleTranslation.lookupValue(loopOp.getUpperBound()[i]);
+    llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]);
 
     // Make sure loop trip count are emitted in the preheader of the outermost
     // loop at the latest so that they are all available for the new collapsed
@@ -1047,7 +1049,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
     }
     loopInfos.push_back(ompBuilder->createCanonicalLoop(
         loc, bodyGen, lowerBound, upperBound, step,
-        /*IsSigned=*/true, loop.getInclusive(), computeIP));
+        /*IsSigned=*/true, loopOp.getInclusive(), computeIP));
 
     if (failed(bodyGenStatus))
       return failure();
@@ -1062,13 +1064,13 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
 
   // TODO: Handle doacross loops when the ordered clause has a parameter.
-  bool isOrdered = loop.getOrderedVal().has_value();
+  bool isOrdered = wsloopOp.getOrderedVal().has_value();
   std::optional<omp::ScheduleModifier> scheduleModifier =
-      loop.getScheduleModifier();
-  bool isSimd = loop.getSimdModifier();
+      wsloopOp.getScheduleModifier();
+  bool isSimd = wsloopOp.getSimdModifier();
 
   ompBuilder->applyWorkshareLoop(
-      ompLoc.DL, loopInfo, allocaIP, !loop.getNowait(),
+      ompLoc.DL, loopInfo, allocaIP, !wsloopOp.getNowait(),
       convertToScheduleKind(schedule), chunk, isSimd,
       scheduleModifier == omp::ScheduleModifier::monotonic,
       scheduleModifier == omp::ScheduleModifier::nonmonotonic, isOrdered);
@@ -1080,7 +1082,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.restoreIP(afterIP);
 
   // Process the reductions if required.
-  if (loop.getNumReductionVars() == 0)
+  if (wsloopOp.getNumReductionVars() == 0)
     return success();
 
   // Create the reduction generators. We need to own them here because
@@ -1088,7 +1090,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   SmallVector<OwningReductionGen> owningReductionGens;
   SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
   SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
-  collectReductionInfo(loop, builder, moduleTranslation, reductionDecls,
+  collectReductionInfo(wsloopOp, builder, moduleTranslation, reductionDecls,
                        owningReductionGens, owningAtomicReductionGens,
                        privateReductionVariables, reductionInfos);
 
@@ -1099,9 +1101,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.SetInsertPoint(tempTerminator);
   llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
       ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
-                                   loop.getNowait(), isByRef);
+                                   wsloopOp.getNowait(), isByRef);
   if (!contInsertPoint.getBlock())
-    return loop->emitOpError() << "failed to convert reductions";
+    return wsloopOp->emitOpError() << "failed to convert reductions";
   auto nextInsertionPoint =
       ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for);
   tempTerminator->eraseFromParent();
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
index b0fe642238f14f..360b3b0c0e60c1 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -12,10 +12,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
       %loop_lb = llvm.mlir.constant(0 : i32) : i32
       %loop_step = llvm.mlir.constant(1 : i32) : i32
-      omp.wsloop for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
-        %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
-        llvm.store %loop_cnt, %gep : i32, !llvm.ptr
-        omp.yield
+      omp.wsloop {
+        omp.loop_nest (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+          %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+          llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+          omp.yield
+        }
+        omp.terminator
       }
      omp.terminator
     }
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
index 0d77423abcb4f1..13d34b7e58f77e 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -8,13 +8,16 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
     %loop_ub = llvm.mlir.constant(99 : i32) : i32
     %loop_lb = llvm.mlir.constant(0 : i32) : i32
     %loop_step = llvm.mlir.constant(1 : index) : i32
-    omp.wsloop for  (%arg1, %arg2) : i32 = (%loop_lb, %loop_lb) to (%loop_ub, %loop_ub) inclusive step (%loop_step, %loop_step) {
-      %1 = llvm.add %arg1, %arg2  : i32
-      %2 = llvm.mul %arg2, %loop_ub overflow<nsw>  : i32
-      %3 = llvm.add %arg1, %2 :i32
-      %4 = llvm.getelementptr %arg0[%3] : (!llvm.ptr, i32) -> !llvm.ptr, i32
-      llvm.store %1, %4 : i32, !llvm.ptr
-      omp.yield
+    omp.wsloop {
+      omp.loop_nest (%arg1, %arg2) : i32 = (%loop_lb, %loop_lb) to (%loop_ub, %loop_ub) inclusive step (%loop_step, %loop_step) {
+        %1 = llvm.add %arg1, %arg2  : i32
+        %2 = llvm.mul %arg2, %loop_ub overflow<nsw>  : i32
+        %3 = llvm.add %arg1, %2 :i32
+        %4 = llvm.getelementptr %arg0[%3] : (!llvm.ptr, i32) -> !llvm.ptr, i32
+        llvm.store %1, %4 : i32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     llvm.return
   }
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 0f3f503dfa5377..ee851eaf71ac0b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -8,10 +8,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
       %loop_lb = llvm.mlir.constant(0 : i32) : i32
       %loop_step = llvm.mlir.constant(1 : i32) : i32
-      omp.wsloop for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
-        %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
-        llvm.store %loop_cnt, %gep : i32, !llvm.ptr
-        omp.yield
+      omp.wsloop {
+        omp.loop_nest (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+          %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+          llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+          omp.yield
+        }
+        omp.terminator
       }
     llvm.return
   }
@@ -20,8 +23,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
       %loop_lb = llvm.mlir.constant(0 : i32) : i32
       %loop_step = llvm.mlir.constant(1 : i32) : i32
-      omp.wsloop for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
-        omp.yield
+      omp.wsloop {
+        omp.loop_nest (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+          omp.yield
+        }
+        omp.terminator
       }
     llvm.return
   }
diff --git a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
index d41429a6de066f..4ea9df369af66c 100644
--- a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
@@ -31,20 +31,23 @@ module attributes { } {
           %18 = llvm.mlir.constant(1 : i64) : i64
           %19 = llvm.alloca %18 x i32 {pinned} : (i64) -> !llvm.ptr<5>
           %20 = llvm.addrspacecast %19 : !llvm.ptr<5> to !llvm.ptr
-          omp.wsloop  for  (%arg2) : i32 = (%16) to (%15) inclusive step (%16) {
-            llvm.store %arg2, %20 : i32, !llvm.ptr
-            %21 = llvm.load %20 : !llvm.ptr -> i32
-            %22 = llvm.sext %21 : i32 to i64
-            %23 = llvm.mlir.constant(1 : i64) : i64
-            %24 = llvm.mlir.constant(0 : i64) : i64
-            %25 = llvm.sub %22, %23 overflow<nsw>  : i64
-            %26 = llvm.mul %25, %23 overflow<nsw>  : i64
-            %27 = llvm.mul %26, %23 overflow<nsw>  : i64
-            %28 = llvm.add %27, %24 overflow<nsw>  : i64
-            %29 = llvm.mul %23, %17 overflow<nsw>  : i64
-            %30 = llvm.getelementptr %arg0[%28] : (!llvm.ptr, i64) -> !llvm.ptr, i32
-            llvm.store %21, %30 : i32, !llvm.ptr
-            omp.yield
+          omp.wsloop {
+            omp.loop_nest (%arg2) : i32 = (%16) to (%15) inclusive step (%16) {
+              llvm.store %arg2, %20 : i32, !llvm.ptr
+              %21 = llvm.load %20 : !llvm.ptr -> i32
+              %22 = llvm.sext %21 : i32 to i64
+              %23 = llvm.mlir.constant(1 : i64) : i64
+              %24 = llvm.mlir.constant(0 : i64) : i64
+              %25 = llvm.sub %22, %23 overflow<nsw>  : i64
+              %26 = llvm.mul %25, %23 overflow<nsw>  : i64
+              %27 = llvm.mul %26, %23 overflow<nsw>  : i64
+              %28 = llvm.add %27, %24 overflow<nsw>  : i64
+              %29 = llvm.mul %23, %17 overflow<nsw>  : i64
+              %30 = llvm.getelementptr %arg0[%28] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+              llvm.store %21, %30 : i32, !llvm.ptr
+              omp.yield
+            }
+            omp.terminator
           }
           omp.terminator
         }
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index d1390022c1dc44..ad40ca26bec9f8 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -320,18 +320,20 @@ llvm.func @wsloop_simple(%arg0: !llvm.ptr) {
   %1 = llvm.mlir.constant(10 : index) : i64
   %2 = llvm.mlir.constant(1 : index) : i64
   omp.parallel {
-    "omp.wsloop"(%1, %0, %2) ({
-    ^bb0(%arg1: i64):
-      // The form of the emitted IR is controlled by OpenMPIRBuilder and
-      // tested there. Just check that the right functions are called.
-      // CHECK: call i32 @__kmpc_global_thread_num
-      // CHECK: call void @__kmpc_for_static_init_{{.*}}(ptr @[[$loc_struct]],
-      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-      %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-      llvm.store %3, %4 : f32, !llvm.ptr
-      omp.yield
+    "omp.wsloop"() ({
+      omp.loop_nest (%arg1) : i64 = (%1) to (%0) step (%2) {
+        // The form of the emitted IR is controlled by OpenMPIRBuilder and
+        // tested there. Just check that the right functions are called.
+        // CHECK: call i32 @__kmpc_global_thread_num
+        // CHECK: call void @__kmpc_for_static_init_{{.*}}(ptr @[[$loc_struct]],
+        %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+        %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+        llvm.store %3, %4 : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
       // CHECK: call void @__kmpc_for_static_fini(ptr @[[$loc_struct]],
-    }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+    }) : () -> ()
     omp.terminator
   }
   llvm.return
@@ -345,13 +347,15 @@ llvm.func @wsloop_inclusive_1(%arg0: !llvm.ptr) {
   %1 = llvm.mlir.constant(10 : index) : i64
   %2 = llvm.mlir.constant(1 : index) : i64
   // CHECK: store i64 31, ptr %{{.*}}upperbound
-  "omp.wsloop"(%1, %0, %2) ({
-  ^bb0(%arg1: i64):
-    %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-    %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    llvm.store %3, %4 : f32, !llvm.ptr
-    omp.yield
-  }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+  "omp.wsloop"() ({
+    omp.loop_nest (%arg1) : i64 = (%1) to (%0) step (%2) {
+      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+      %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      omp.yield
+    }
+    omp.terminator
+  }) : () -> ()
   llvm.return
 }
 
@@ -363,13 +367,15 @@ llvm.func @wsloop_inclusive_2(%arg0: !llvm.ptr) {
   %1 = llvm.mlir.constant(10 : index) : i64
   %2 = llvm.mlir.constant(1 : index) : i64
   // CHECK: store i64 32, ptr %{{.*}}upperbound
-  "omp.wsloop"(%1, %0, %2) ({
-  ^bb0(%arg1: i64):
-    %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-    %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    llvm.store %3, %4 : f32, !llvm.ptr
-    omp.yield
-  }) {inclusive, operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 0, 0>} : (i64, i64, i64) -> ()
+  "omp.wsloop"() ({
+    omp.loop_nest (%arg1) : i64 = (%1) to (%0) inclusive step (%2) {
+      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+      %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      omp.yield
+    }
+    omp.terminator
+  }) : () -> ()
   llvm.return
 }
 
@@ -379,14 +385,16 @@ llvm.func @body(i32)
 
 // CHECK-LABEL: @test_omp_wsloop_static_defchunk
 llvm.func @test_omp_wsloop_static_defchunk(%lb : i32, %ub : i32, %step : i32) -> () {
- omp.wsloop schedule(static)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-   // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 34, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 0)
-   // CHECK: call void @__kmpc_for_static_fini
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(static) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 34, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 0)
+      // CHECK: call void @__kmpc_for_static_fini
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -395,15 +403,17 @@ llvm.func @body(i32)
 
 // CHECK-LABEL: @test_omp_wsloop_static_1
 llvm.func @test_omp_wsloop_static_1(%lb : i32, %ub : i32, %step : i32) -> () {
- %static_chunk_size = llvm.mlir.constant(1 : i32) : i32
- omp.wsloop schedule(static = %static_chunk_size : i32)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-   // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 33, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 1)
-   // CHECK: call void @__kmpc_for_static_fini
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  %static_chunk_size = llvm.mlir.constant(1 : i32) : i32
+  omp.wsloop schedule(static = %static_chunk_size : i32) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 33, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 1)
+      // CHECK: call void @__kmpc_for_static_fini
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -412,15 +422,17 @@ llvm.func @body(i32)
 
 // CHECK-LABEL: @test_omp_wsloop_static_2
 llvm.func @test_omp_wsloop_static_2(%lb : i32, %ub : i32, %step : i32) -> () {
- %static_chunk_size = llvm.mlir.constant(2 : i32) : i32
- omp.wsloop schedule(static = %static_chunk_size : i32)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-   // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 33, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 2)
-   // CHECK: call void @__kmpc_for_static_fini
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  %static_chunk_size = llvm.mlir.constant(2 : i32) : i32
+  omp.wsloop schedule(static = %static_chunk_size : i32) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_for_static_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 33, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i32 1, i32 2)
+      // CHECK: call void @__kmpc_for_static_fini
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -428,16 +440,18 @@ llvm.func @test_omp_wsloop_static_2(%lb : i32, %ub : i32, %step : i32) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(dynamic)
- for (%iv) : i64 = (%lb) to (%ub) step (%step)  {
-  // CHECK: call void @__kmpc_dispatch_init_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(dynamic) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step)  {
+      // CHECK: call void @__kmpc_dispatch_init_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -445,17 +459,19 @@ llvm.func @test_omp_wsloop_dynamic(%lb : i64, %ub : i64, %step : i64) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_chunk_const(%lb : i64, %ub : i64, %step : i64) -> () {
- %chunk_size_const = llvm.mlir.constant(2 : i16) : i16
- omp.wsloop schedule(dynamic = %chunk_size_const : i16)
- for (%iv) : i64 = (%lb) to (%ub) step (%step)  {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i64 {{.*}}, i64 %{{.*}}, i64 {{.*}}, i64 2)
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  %chunk_size_const = llvm.mlir.constant(2 : i16) : i16
+  omp.wsloop schedule(dynamic = %chunk_size_const : i16) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step)  {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i64 {{.*}}, i64 %{{.*}}, i64 {{.*}}, i64 2)
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -463,20 +479,22 @@ llvm.func @test_omp_wsloop_dynamic_chunk_const(%lb : i64, %ub : i64, %step : i64
 llvm.func @body(i32)
 
 llvm.func @test_omp_wsloop_dynamic_chunk_var(%lb : i32, %ub : i32, %step : i32) -> () {
- %1 = llvm.mlir.constant(1 : i64) : i64
- %chunk_size_alloca = llvm.alloca %1 x i16 {bindc_name = "chunk_size", in_type = i16, uniq_name = "_QFsub1Echunk_size"} : (i64) -> !llvm.ptr
- %chunk_size_var = llvm.load %chunk_size_alloca : !llvm.ptr -> i16
- omp.wsloop schedule(dynamic = %chunk_size_var : i16)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-  // CHECK: %[[CHUNK_SIZE:.*]] = sext i16 %{{.*}} to i32
-  // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %[[CHUNK_SIZE]])
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %chunk_size_alloca = llvm.alloca %1 x i16 {bindc_name = "chunk_size", in_type = i16, uniq_name = "_QFsub1Echunk_size"} : (i64) -> !llvm.ptr
+  %chunk_size_var = llvm.load %chunk_size_alloca : !llvm.ptr -> i16
+  omp.wsloop schedule(dynamic = %chunk_size_var : i16) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: %[[CHUNK_SIZE:.*]] = sext i16 %{{.*}} to i32
+      // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %[[CHUNK_SIZE]])
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -484,20 +502,22 @@ llvm.func @test_omp_wsloop_dynamic_chunk_var(%lb : i32, %ub : i32, %step : i32)
 llvm.func @body(i32)
 
 llvm.func @test_omp_wsloop_dynamic_chunk_var2(%lb : i32, %ub : i32, %step : i32) -> () {
- %1 = llvm.mlir.constant(1 : i64) : i64
- %chunk_size_alloca = llvm.alloca %1 x i64 {bindc_name = "chunk_size", in_type = i64, uniq_name = "_QFsub1Echunk_size"} : (i64) -> !llvm.ptr
- %chunk_size_var = llvm.load %chunk_size_alloca : !llvm.ptr -> i64
- omp.wsloop schedule(dynamic = %chunk_size_var : i64)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-  // CHECK: %[[CHUNK_SIZE:.*]] = trunc i64 %{{.*}} to i32
-  // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %[[CHUNK_SIZE]])
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %chunk_size_alloca = llvm.alloca %1 x i64 {bindc_name = "chunk_size", in_type = i64, uniq_name = "_QFsub1Echunk_size"} : (i64) -> !llvm.ptr
+  %chunk_size_var = llvm.load %chunk_size_alloca : !llvm.ptr -> i64
+  omp.wsloop schedule(dynamic = %chunk_size_var : i64) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: %[[CHUNK_SIZE:.*]] = trunc i64 %{{.*}} to i32
+      // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %[[CHUNK_SIZE]])
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -505,16 +525,18 @@ llvm.func @test_omp_wsloop_dynamic_chunk_var2(%lb : i32, %ub : i32, %step : i32)
 llvm.func @body(i32)
 
 llvm.func @test_omp_wsloop_dynamic_chunk_var3(%lb : i32, %ub : i32, %step : i32, %chunk_size : i32) -> () {
- omp.wsloop schedule(dynamic = %chunk_size : i32)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %{{.*}})
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(dynamic = %chunk_size : i32) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, i32 %{{.*}})
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK: br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -522,16 +544,18 @@ llvm.func @test_omp_wsloop_dynamic_chunk_var3(%lb : i32, %ub : i32, %step : i32,
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_auto(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(auto)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(auto) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -539,14 +563,16 @@ llvm.func @test_omp_wsloop_auto(%lb : i64, %ub : i64, %step : i64) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_runtime(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(runtime)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(runtime) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -556,14 +582,16 @@ llvm.func @test_omp_wsloop_runtime(%lb : i64, %ub : i64, %step : i64) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_guided(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(guided)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(guided) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -573,14 +601,16 @@ llvm.func @test_omp_wsloop_guided(%lb : i64, %ub : i64, %step : i64) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_nonmonotonic(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(dynamic, nonmonotonic)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(dynamic, nonmonotonic) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741859
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -590,14 +620,16 @@ llvm.func @test_omp_wsloop_dynamic_nonmonotonic(%lb : i64, %ub : i64, %step : i6
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_monotonic(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(dynamic, monotonic)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 536870947
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(dynamic, monotonic) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 536870947
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -607,14 +639,16 @@ llvm.func @test_omp_wsloop_dynamic_monotonic(%lb : i64, %ub : i64, %step : i64)
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_runtime_simd(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(runtime, simd)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741871
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(runtime, simd) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741871
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -624,14 +658,16 @@ llvm.func @test_omp_wsloop_runtime_simd(%lb : i64, %ub : i64, %step : i64) -> ()
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_guided_simd(%lb : i64, %ub : i64, %step : i64) -> () {
-  omp.wsloop schedule(guided, simd)
-  for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-    // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741870
-    // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-    // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-    // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-    llvm.call @body(%iv) : (i64) -> ()
-    omp.yield
+  omp.wsloop schedule(guided, simd) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741870
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
@@ -793,17 +829,19 @@ llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fi
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 66, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 66, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -811,17 +849,19 @@ llvm.func @test_omp_wsloop_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_static_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(static) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 66, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(static) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 66, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -829,18 +869,20 @@ llvm.func @test_omp_wsloop_static_ordered(%lb : i64, %ub : i64, %step : i64) ->
 llvm.func @body(i32)
 
 llvm.func @test_omp_wsloop_static_chunk_ordered(%lb : i32, %ub : i32, %step : i32) -> () {
- %static_chunk_size = llvm.mlir.constant(1 : i32) : i32
- omp.wsloop schedule(static = %static_chunk_size : i32) ordered(0)
- for (%iv) : i32 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 65, i32 1, i32 %{{.*}}, i32 1, i32 1)
-  // CHECK: call void @__kmpc_dispatch_fini_4u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i32) -> ()
-   omp.yield
- }
- llvm.return
+  %static_chunk_size = llvm.mlir.constant(1 : i32) : i32
+  omp.wsloop schedule(static = %static_chunk_size : i32) ordered(0) {
+    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_4u(ptr @{{.*}}, i32 %{{.*}}, i32 65, i32 1, i32 %{{.*}}, i32 1, i32 1)
+      // CHECK: call void @__kmpc_dispatch_fini_4u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_4u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i32) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -848,17 +890,19 @@ llvm.func @test_omp_wsloop_static_chunk_ordered(%lb : i32, %ub : i32, %step : i3
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(dynamic) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 67, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(dynamic) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 67, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -866,17 +910,19 @@ llvm.func @test_omp_wsloop_dynamic_ordered(%lb : i64, %ub : i64, %step : i64) ->
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_auto_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(auto) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 70, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(auto) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 70, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -884,17 +930,19 @@ llvm.func @test_omp_wsloop_auto_ordered(%lb : i64, %ub : i64, %step : i64) -> ()
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_runtime_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(runtime) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 69, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(runtime) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 69, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -902,17 +950,19 @@ llvm.func @test_omp_wsloop_runtime_ordered(%lb : i64, %ub : i64, %step : i64) ->
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_guided_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(guided) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 68, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(guided) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 68, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -920,17 +970,19 @@ llvm.func @test_omp_wsloop_guided_ordered(%lb : i64, %ub : i64, %step : i64) ->
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_nonmonotonic_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(dynamic, nonmonotonic) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741891, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(dynamic, nonmonotonic) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 1073741891, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -938,17 +990,19 @@ llvm.func @test_omp_wsloop_dynamic_nonmonotonic_ordered(%lb : i64, %ub : i64, %s
 llvm.func @body(i64)
 
 llvm.func @test_omp_wsloop_dynamic_monotonic_ordered(%lb : i64, %ub : i64, %step : i64) -> () {
- omp.wsloop schedule(dynamic, monotonic) ordered(0)
- for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-  // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 536870979, i64 1, i64 %{{.*}}, i64 1, i64 1)
-  // CHECK: call void @__kmpc_dispatch_fini_8u
-  // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
-  // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
-  // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
-   llvm.call @body(%iv) : (i64) -> ()
-   omp.yield
- }
- llvm.return
+  omp.wsloop schedule(dynamic, monotonic) ordered(0) {
+    omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      // CHECK: call void @__kmpc_dispatch_init_8u(ptr @{{.*}}, i32 %{{.*}}, i32 536870979, i64 1, i64 %{{.*}}, i64 1, i64 1)
+      // CHECK: call void @__kmpc_dispatch_fini_8u
+      // CHECK: %[[continue:.*]] = call i32 @__kmpc_dispatch_next_8u
+      // CHECK: %[[cond:.*]] = icmp ne i32 %[[continue]], 0
+      // CHECK  br i1 %[[cond]], label %omp_loop.header{{.*}}, label %omp_loop.exit{{.*}}
+      llvm.call @body(%iv) : (i64) -> ()
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
 }
 
 // -----
@@ -1114,14 +1168,16 @@ llvm.func @collapse_wsloop(
     // CHECK: %[[TOTAL_SUB_1:.*]] = sub i32 %[[TOTAL]], 1
     // CHECK: store i32 %[[TOTAL_SUB_1]], ptr
     // CHECK: call void @__kmpc_for_static_init_4u
-    omp.wsloop
-    for (%arg0, %arg1, %arg2) : i32 = (%0, %1, %2) to (%3, %4, %5) step (%6, %7, %8) {
-      %31 = llvm.load %20 : !llvm.ptr -> i32
-      %32 = llvm.add %31, %arg0 : i32
-      %33 = llvm.add %32, %arg1 : i32
-      %34 = llvm.add %33, %arg2 : i32
-      llvm.store %34, %20 : i32, !llvm.ptr
-      omp.yield
+    omp.wsloop {
+      omp.loop_nest (%arg0, %arg1, %arg2) : i32 = (%0, %1, %2) to (%3, %4, %5) step (%6, %7, %8) {
+        %31 = llvm.load %20 : !llvm.ptr -> i32
+        %32 = llvm.add %31, %arg0 : i32
+        %33 = llvm.add %32, %arg1 : i32
+        %34 = llvm.add %33, %arg2 : i32
+        llvm.store %34, %20 : i32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -1175,14 +1231,16 @@ llvm.func @collapse_wsloop_dynamic(
     // CHECK: store i32 1, ptr
     // CHECK: store i32 %[[TOTAL]], ptr
     // CHECK: call void @__kmpc_dispatch_init_4u
-    omp.wsloop schedule(dynamic)
-    for (%arg0, %arg1, %arg2) : i32 = (%0, %1, %2) to (%3, %4, %5) step (%6, %7, %8) {
-      %31 = llvm.load %20 : !llvm.ptr -> i32
-      %32 = llvm.add %31, %arg0 : i32
-      %33 = llvm.add %32, %arg1 : i32
-      %34 = llvm.add %33, %arg2 : i32
-      llvm.store %34, %20 : i32, !llvm.ptr
-      omp.yield
+    omp.wsloop schedule(dynamic) {
+      omp.loop_nest (%arg0, %arg1, %arg2) : i32 = (%0, %1, %2) to (%3, %4, %5) step (%6, %7, %8) {
+        %31 = llvm.load %20 : !llvm.ptr -> i32
+        %32 = llvm.add %31, %arg0 : i32
+        %33 = llvm.add %32, %arg1 : i32
+        %34 = llvm.add %33, %arg2 : i32
+        llvm.store %34, %20 : i32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -1207,63 +1265,69 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
   // CHECK: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[OMP_THREAD]])
   }
 
-  omp.wsloop ordered(0)
-  for (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
-    // CHECK:  call void @__kmpc_ordered(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_THREAD2:%.*]])
-    omp.ordered.region  {
-      omp.terminator
-    // CHECK: call void @__kmpc_end_ordered(ptr @[[GLOB3]], i32 [[OMP_THREAD2]])
+  omp.wsloop ordered(0) {
+    omp.loop_nest (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
+      // CHECK:  call void @__kmpc_ordered(ptr @[[GLOB3:[0-9]+]], i32 [[OMP_THREAD2:%.*]])
+      omp.ordered.region  {
+        omp.terminator
+      // CHECK: call void @__kmpc_end_ordered(ptr @[[GLOB3]], i32 [[OMP_THREAD2]])
+      }
+      omp.yield
     }
-    omp.yield
+    omp.terminator
   }
 
-  omp.wsloop ordered(1)
-  for (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
-    // CHECK: [[TMP:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
-    // CHECK: store i64 [[ARG0:%.*]], ptr [[TMP]], align 8
-    // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
-    // CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
-    // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB3]], i32 [[OMP_THREAD2]], ptr [[TMP2]])
-    omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+  omp.wsloop ordered(1) {
+    omp.loop_nest (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
+      // CHECK: [[TMP:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
+      // CHECK: store i64 [[ARG0:%.*]], ptr [[TMP]], align 8
+      // CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
+      // CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+      // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB3]], i32 [[OMP_THREAD2]], ptr [[TMP2]])
+      omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
 
-    // CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
-    // CHECK: store i64 [[ARG0]], ptr [[TMP3]], align 8
-    // CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
-    // CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB5:[0-9]+]])
-    // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB5]], i32 [[OMP_THREAD4]], ptr [[TMP4]])
-    omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+      // CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
+      // CHECK: store i64 [[ARG0]], ptr [[TMP3]], align 8
+      // CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
+      // CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB5:[0-9]+]])
+      // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB5]], i32 [[OMP_THREAD4]], ptr [[TMP4]])
+      omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
 
-    omp.yield
+      omp.yield
+    }
+    omp.terminator
   }
 
-  omp.wsloop ordered(2)
-  for (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
-    // CHECK: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 0
-    // CHECK: store i64 [[ARG0]], ptr [[TMP5]], align 8
-    // CHECK: [[TMP6:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 1
-    // CHECK: store i64 [[ARG1:%.*]], ptr [[TMP6]], align 8
-    // CHECK: [[TMP7:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 0
-    // CHECK: [[OMP_THREAD6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]])
-    // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD6]], ptr [[TMP7]])
-    // CHECK: [[TMP8:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
-    // CHECK: store i64 [[ARG2:%.*]], ptr [[TMP8]], align 8
-    // CHECK: [[TMP9:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 1
-    // CHECK: store i64 [[ARG3:%.*]], ptr [[TMP9]], align 8
-    // CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
-    // CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7]])
-    // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD8]], ptr [[TMP10]])
-    omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+  omp.wsloop ordered(2) {
+    omp.loop_nest (%arg7) : i32 = (%arg0) to (%arg1) step (%arg2) {
+      // CHECK: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 0
+      // CHECK: store i64 [[ARG0]], ptr [[TMP5]], align 8
+      // CHECK: [[TMP6:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 1
+      // CHECK: store i64 [[ARG1:%.*]], ptr [[TMP6]], align 8
+      // CHECK: [[TMP7:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR5]], i64 0, i64 0
+      // CHECK: [[OMP_THREAD6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7:[0-9]+]])
+      // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD6]], ptr [[TMP7]])
+      // CHECK: [[TMP8:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
+      // CHECK: store i64 [[ARG2:%.*]], ptr [[TMP8]], align 8
+      // CHECK: [[TMP9:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 1
+      // CHECK: store i64 [[ARG3:%.*]], ptr [[TMP9]], align 8
+      // CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
+      // CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7]])
+      // CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD8]], ptr [[TMP10]])
+      omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+
+      // CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
+      // CHECK: store i64 [[ARG0]], ptr [[TMP11]], align 8
+      // CHECK: [[TMP12:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 1
+      // CHECK: store i64 [[ARG1]], ptr [[TMP12]], align 8
+      // CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
+      // CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB9:[0-9]+]])
+      // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB9]], i32 [[OMP_THREAD10]], ptr [[TMP13]])
+      omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64}
 
-    // CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
-    // CHECK: store i64 [[ARG0]], ptr [[TMP11]], align 8
-    // CHECK: [[TMP12:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 1
-    // CHECK: store i64 [[ARG1]], ptr [[TMP12]], align 8
-    // CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
-    // CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB9:[0-9]+]])
-    // CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB9]], i32 [[OMP_THREAD10]], ptr [[TMP13]])
-    omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64}
-
-    omp.yield
+      omp.yield
+    }
+    omp.terminator
   }
 
   llvm.return
@@ -2133,10 +2197,13 @@ llvm.func @omp_sections_with_clauses() -> () {
 // introduction mechanism itself is tested elsewhere.
 // CHECK-LABEL: @repeated_successor
 llvm.func @repeated_successor(%arg0: i64, %arg1: i64, %arg2: i64, %arg3: i1) {
-  omp.wsloop for (%arg4) : i64 = (%arg0) to (%arg1) step (%arg2)  {
-    llvm.cond_br %arg3, ^bb1(%arg0 : i64), ^bb1(%arg1 : i64)
-  ^bb1(%0: i64):  // 2 preds: ^bb0, ^bb0
-    omp.yield
+  omp.wsloop {
+    omp.loop_nest (%arg4) : i64 = (%arg0) to (%arg1) step (%arg2)  {
+      llvm.cond_br %arg3, ^bb1(%arg0 : i64), ^bb1(%arg1 : i64)
+    ^bb1(%0: i64):  // 2 preds: ^bb0, ^bb0
+      omp.yield
+    }
+    omp.terminator
   }
   llvm.return
 }
diff --git a/mlir/test/Target/LLVMIR/openmp-nested.mlir b/mlir/test/Target/LLVMIR/openmp-nested.mlir
index e1fdfdd24a3cb0..ce5f22f10d7dce 100644
--- a/mlir/test/Target/LLVMIR/openmp-nested.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-nested.mlir
@@ -11,20 +11,26 @@ module {
       %2 = llvm.mlir.constant(0 : index) : i64
       %4 = llvm.mlir.constant(0 : i32) : i32
       %12 = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr
-      omp.wsloop for (%arg2) : i64 = (%2) to (%1) step (%0)  {
-        omp.parallel   {
-          omp.wsloop for (%arg3) : i64 = (%2) to (%0) step (%0)  {
-            llvm.store %2, %12 : i64, !llvm.ptr
-            omp.yield
+      omp.wsloop {
+        omp.loop_nest (%arg2) : i64 = (%2) to (%1) step (%0) {
+          omp.parallel {
+            omp.wsloop {
+              omp.loop_nest (%arg3) : i64 = (%2) to (%0) step (%0) {
+                llvm.store %2, %12 : i64, !llvm.ptr
+                omp.yield
+              }
+              omp.terminator
+            }
+            omp.terminator
           }
-          omp.terminator
+          %19 = llvm.load %12 : !llvm.ptr -> i64
+          %20 = llvm.trunc %19 : i64 to i32
+          %5 = llvm.mlir.addressof @str0 : !llvm.ptr
+          %6 = llvm.getelementptr %5[%4, %4] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.array<29 x i8>
+          %21 = llvm.call @printf(%6, %20, %20) vararg(!llvm.func<i32 (ptr, ...)>): (!llvm.ptr, i32, i32) -> i32
+          omp.yield
         }
-        %19 = llvm.load %12 : !llvm.ptr -> i64
-        %20 = llvm.trunc %19 : i64 to i32
-        %5 = llvm.mlir.addressof @str0 : !llvm.ptr
-        %6 = llvm.getelementptr %5[%4, %4] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.array<29 x i8>
-        %21 = llvm.call @printf(%6, %20, %20) vararg(!llvm.func<i32 (ptr, ...)>): (!llvm.ptr, i32, i32) -> i32
-        omp.yield
+        omp.terminator
       }
       omp.terminator
     }
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
index 39b64d71a2274b..bfdad8c19335e1 100644
--- a/mlir/test/Target/LLVMIR/openmp-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
@@ -26,13 +26,15 @@ llvm.func @simple_reduction(%lb : i64, %ub : i64, %step : i64) {
   %c1 = llvm.mlir.constant(1 : i32) : i32
   %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   omp.parallel {
-    omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr)
-    for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %1 = llvm.mlir.constant(2.0 : f32) : f32
-      %2 = llvm.load %prv : !llvm.ptr -> f32
-      %3 = llvm.fadd %1, %2 : f32
-      llvm.store %3, %prv : f32, !llvm.ptr
-      omp.yield
+    omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %1 = llvm.mlir.constant(2.0 : f32) : f32
+        %2 = llvm.load %prv : !llvm.ptr -> f32
+        %3 = llvm.fadd %1, %2 : f32
+        llvm.store %3, %prv : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -105,16 +107,18 @@ llvm.func @reuse_declaration(%lb : i64, %ub : i64, %step : i64) {
   %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   omp.parallel {
-    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @add_f32 %2 -> %prv1 : !llvm.ptr)
-    for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %1 = llvm.mlir.constant(2.0 : f32) : f32
-      %3 = llvm.load %prv0 : !llvm.ptr -> f32
-      %4 = llvm.fadd %3, %1 : f32
-      llvm.store %4, %prv0 : f32, !llvm.ptr
-      %5 = llvm.load %prv1 : !llvm.ptr -> f32
-      %6 = llvm.fadd %5, %1 : f32
-      llvm.store %6, %prv1 : f32, !llvm.ptr
-      omp.yield
+    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @add_f32 %2 -> %prv1 : !llvm.ptr) {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %1 = llvm.mlir.constant(2.0 : f32) : f32
+        %3 = llvm.load %prv0 : !llvm.ptr -> f32
+        %4 = llvm.fadd %3, %1 : f32
+        llvm.store %4, %prv0 : f32, !llvm.ptr
+        %5 = llvm.load %prv1 : !llvm.ptr -> f32
+        %6 = llvm.fadd %5, %1 : f32
+        llvm.store %6, %prv1 : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -195,13 +199,15 @@ llvm.func @missing_omp_reduction(%lb : i64, %ub : i64, %step : i64) {
   %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   omp.parallel {
-    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @add_f32 %2 -> %prv1 : !llvm.ptr)
-    for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %1 = llvm.mlir.constant(2.0 : f32) : f32
-      %3 = llvm.load %prv0 : !llvm.ptr -> f32
-      %4 = llvm.fadd %3, %1 : f32
-      llvm.store %4, %prv0 : f32, !llvm.ptr
-      omp.yield
+    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @add_f32 %2 -> %prv1 : !llvm.ptr) {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %1 = llvm.mlir.constant(2.0 : f32) : f32
+        %3 = llvm.load %prv0 : !llvm.ptr -> f32
+        %4 = llvm.fadd %3, %1 : f32
+        llvm.store %4, %prv0 : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -280,16 +286,18 @@ llvm.func @double_reference(%lb : i64, %ub : i64, %step : i64) {
   %c1 = llvm.mlir.constant(1 : i32) : i32
   %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   omp.parallel {
-    omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr)
-    for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %1 = llvm.mlir.constant(2.0 : f32) : f32
-      %2 = llvm.load %prv : !llvm.ptr -> f32
-      %3 = llvm.fadd %2, %1 : f32
-      llvm.store %3, %prv : f32, !llvm.ptr
-      %4 = llvm.load %prv : !llvm.ptr -> f32
-      %5 = llvm.fadd %4, %1 : f32
-      llvm.store %5, %prv : f32, !llvm.ptr
-      omp.yield
+    omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %1 = llvm.mlir.constant(2.0 : f32) : f32
+        %2 = llvm.load %prv : !llvm.ptr -> f32
+        %3 = llvm.fadd %2, %1 : f32
+        llvm.store %3, %prv : f32, !llvm.ptr
+        %4 = llvm.load %prv : !llvm.ptr -> f32
+        %5 = llvm.fadd %4, %1 : f32
+        llvm.store %5, %prv : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -374,16 +382,18 @@ llvm.func @no_atomic(%lb : i64, %ub : i64, %step : i64) {
   %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
   omp.parallel {
-    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @mul_f32 %2 -> %prv1 : !llvm.ptr)
-    for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %1 = llvm.mlir.constant(2.0 : f32) : f32
-      %3 = llvm.load %prv0 : !llvm.ptr -> f32
-      %4 = llvm.fadd %3, %1 : f32
-      llvm.store %4, %prv0 : f32, !llvm.ptr
-      %5 = llvm.load %prv1 : !llvm.ptr -> f32
-      %6 = llvm.fmul %5, %1 : f32
-      llvm.store %6, %prv1 : f32, !llvm.ptr
-      omp.yield
+    omp.wsloop reduction(@add_f32 %0 -> %prv0 : !llvm.ptr, @mul_f32 %2 -> %prv1 : !llvm.ptr) {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %1 = llvm.mlir.constant(2.0 : f32) : f32
+        %3 = llvm.load %prv0 : !llvm.ptr -> f32
+        %4 = llvm.fadd %3, %1 : f32
+        llvm.store %4, %prv0 : f32, !llvm.ptr
+        %5 = llvm.load %prv1 : !llvm.ptr -> f32
+        %6 = llvm.fmul %5, %1 : f32
+        llvm.store %6, %prv1 : f32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
@@ -531,12 +541,15 @@ llvm.func @parallel_nested_workshare_reduction(%ub : i64) {
   %step = llvm.mlir.constant(1 : i64) : i64
   
   omp.parallel reduction(@add_i32 %0 -> %prv : !llvm.ptr) {
-    omp.wsloop for (%iv) : i64 = (%lb) to (%ub) step (%step) {
-      %ival = llvm.trunc %iv : i64 to i32
-      %lprv = llvm.load %prv : !llvm.ptr -> i32
-      %add = llvm.add %lprv, %ival : i32
-      llvm.store %add, %prv : i32, !llvm.ptr
-      omp.yield
+    omp.wsloop {
+      omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) {
+        %ival = llvm.trunc %iv : i64 to i32
+        %lprv = llvm.load %prv : !llvm.ptr -> i32
+        %add = llvm.add %lprv, %ival : i32
+        llvm.store %add, %prv : i32, !llvm.ptr
+        omp.yield
+      }
+      omp.terminator
     }
     omp.terminator
   }
diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
index 3842522934e48e..7a1a31830ce9bc 100644
--- a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
@@ -30,9 +30,12 @@
     %loop_ub = llvm.mlir.constant(9 : i32) : i32
     %loop_lb = llvm.mlir.constant(0 : i32) : i32
     %loop_step = llvm.mlir.constant(1 : i32) : i32 
-    omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
-      llvm.store %0, %arg0 : i32, !llvm.ptr
-      llvm.store %0, %arg1 : i32, !llvm.ptr
+    omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) {
+      omp.loop_nest (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+        llvm.store %0, %arg0 : i32, !llvm.ptr
+        llvm.store %0, %arg1 : i32, !llvm.ptr
+        omp.yield
+      }
       omp.terminator
     }
     llvm.return
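
For reference, every hunk above applies the same mechanical rewrite: the old single-operation form `omp.wsloop ... for (...)` becomes a wrapper region that holds an `omp.loop_nest` plus an `omp.terminator`, with clauses staying on the wrapper and bounds/induction variables moving to the nest. A minimal sketch of the new shape (the function name @wsloop_sketch and its arguments are illustrative only, not taken from the tests above):

  llvm.func @wsloop_sketch(%lb : i32, %ub : i32, %step : i32) {
    // Wrapper op: carries worksharing clauses such as schedule, ordered or reduction.
    omp.wsloop {
      // Wrapped loop: carries the induction variable, bounds and step.
      omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
        // Loop body goes here; %iv is only visible inside the nest.
        omp.yield
      }
      omp.terminator
    }
    llvm.return
  }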


