[Mlir-commits] [mlir] 31dacdc - [flang][openmp] Add support for ordered regions in SIMD directives (#181012)

Wed Feb 25 09:49:11 PST 2026

Author: Sunil Shrestha
Date: 2026-02-25T11:49:07-06:00
New Revision: 31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff

URL: https://github.com/llvm/llvm-project/commit/31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff
DIFF: https://github.com/llvm/llvm-project/commit/31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff.diff

LOG: [flang][openmp] Add support for ordered regions in SIMD directives (#181012)

Add support for ordered regions within SIMD directives (!$omp simd
ordered and !$omp do simd ordered). This initial implementation matches
Clang's behavior.

In SIMD directives, loop induction variables have an implicit linear
clause with deferred store semantics (storing to .linear_result). To
properly support ordered regions, the LinearClauseProcessor rewrites
variable references to use .linear_result in:
- omp.ordered.region: Code inside ordered blocks
- omp_region.finalize: Code after ordered blocks

Note: The vectorizer cannot currently vectorize loops with ordered
regions. Future enhancement would require generating lane loops or
unrolling ordered regions across SIMD lanes while maintaining ordering
semantics.

Added: 
    flang/test/Lower/OpenMP/ordered-simd.f90
    mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
    mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir

Modified: 
    flang/lib/Lower/OpenMP/ClauseProcessor.cpp
    flang/lib/Lower/OpenMP/ClauseProcessor.h
    flang/lib/Lower/OpenMP/OpenMP.cpp
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
    mlir/test/Target/LLVMIR/openmp-todo.mlir

Removed: 
    


################################################################################
diff  --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 77fb61274b34f..0ba4eddcfec2a 100644

--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -718,6 +718,11 @@ bool ClauseProcessor::processSimdlen(
   return false;
 }
 
+bool ClauseProcessor::processSimd(
+    mlir::omp::OrderedRegionOperands &result) const {
+  return markClauseOccurrence<omp::clause::Simd>(result.parLevelSimd);
+}
+
 bool ClauseProcessor::processThreadLimit(
     lower::StatementContext &stmtCtx,
     mlir::omp::ThreadLimitClauseOps &result) const {

diff  --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index da920407b2164..28fdc2b2dd0f0 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -116,6 +116,7 @@ class ClauseProcessor {
   bool processSchedule(lower::StatementContext &stmtCtx,
                        mlir::omp::ScheduleClauseOps &result) const;
   bool processSimdlen(mlir::omp::SimdlenClauseOps &result) const;
+  bool processSimd(mlir::omp::OrderedRegionOperands &result) const;
   bool processThreadLimit(lower::StatementContext &stmtCtx,
                           mlir::omp::ThreadLimitClauseOps &result) const;
   bool processUntied(mlir::omp::UntiedClauseOps &result) const;

diff  --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 85aaf52227e25..e2018add11206 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1589,7 +1589,7 @@ genOrderedRegionClauses(lower::AbstractConverter &converter,
                         const List<Clause> &clauses, mlir::Location loc,
                         mlir::omp::OrderedRegionOperands &clauseOps) {
   ClauseProcessor cp(converter, semaCtx, clauses);
-  cp.processTODO<clause::Simd>(loc, llvm::omp::Directive::OMPD_ordered);
+  cp.processSimd(clauseOps);
 }
 
 static void genParallelClauses(

diff  --git a/flang/test/Lower/OpenMP/ordered-simd.f90 b/flang/test/Lower/OpenMP/ordered-simd.f90
new file mode 100644
index 0000000000000..849900993319a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/ordered-simd.f90
@@ -0,0 +1,57 @@
+! This test checks lowering of SIMD constructs with ordered regions.
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! Test that ordered regions inside SIMD have par_level_simd attribute
+subroutine ordered_simd(n)
+  integer :: n, a(n), b(n), c(n), i
+
+! CHECK-LABEL: func @_QPordered_simd
+! CHECK:         omp.simd linear({{.*}}) private({{.*}}) {
+! CHECK:           omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK:             omp.ordered.region par_level_simd {
+! CHECK:               omp.terminator
+! CHECK:             }
+! CHECK:             omp.yield
+! CHECK:           }
+! CHECK:         } {linear_var_types = [i32]}
+
+  !$omp simd
+  do i = 1, n
+    a(i) = b(i) * 10
+    !$omp ordered simd
+    print *, a(i)
+    !$omp end ordered
+    c(i) = a(i) * 2
+  end do
+  !$omp end simd
+
+end subroutine
+
+! Test that ordered regions inside DO SIMD have par_level_simd attribute
+subroutine ws_ordered_simd(n)
+  integer :: n, a(n), b(n), c(n), i
+
+! CHECK-LABEL: func @_QPws_ordered_simd
+! CHECK:         omp.wsloop ordered(0) {
+! CHECK:           omp.simd linear({{.*}}) private({{.*}}) {
+! CHECK:             omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK:               omp.ordered.region par_level_simd {
+! CHECK:                 omp.terminator
+! CHECK:               }
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:           } {linear_var_types = [i32], omp.composite}
+! CHECK:         } {omp.composite}
+
+  !$omp do simd ordered
+  do i = 1, n
+    a(i) = b(i) * 10
+    !$omp ordered simd
+    print *, a(i)
+    !$omp end ordered
+    c(i) = a(i) * 2
+  end do
+  !$omp end do simd
+
+end subroutine

diff  --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 38c5802ed60ed..571575762d54a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -354,10 +354,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getOrder() || op.getOrderMod())
       result = todo("order");
   };
-  auto checkParLevelSimd = [&todo](auto op, LogicalResult &result) {
-    if (op.getParLevelSimd())
-      result = todo("parallelization-level");
-  };
   auto checkPrivate = [&todo](auto op, LogicalResult &result) {
     if (!op.getPrivateVars().empty() || op.getPrivateSyms())
       result = todo("privatization");
@@ -396,7 +392,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkAllocate(op, result);
         checkOrder(op, result);
       })
-      .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
       .Case([&](omp::SectionsOp op) {
         checkAllocate(op, result);
         checkPrivate(op, result);
@@ -3515,9 +3510,26 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                         order, simdlen, safelen);
 
   linearClauseProcessor.emitStoresForLinearVar(builder);
-  for (size_t index = 0; index < simdOp.getLinearVars().size(); index++)
+
+  // Check if this SIMD loop contains ordered regions
+  bool hasOrderedRegions = false;
+  simdOp.getRegion().walk([&](omp::OrderedRegionOp orderedOp) {
+    hasOrderedRegions = true;
+    return WalkResult::interrupt();
+  });
+
+  for (size_t index = 0; index < simdOp.getLinearVars().size(); index++) {
     linearClauseProcessor.rewriteInPlace(builder, "omp.loop_nest.region",
                                          index);
+    if (hasOrderedRegions) {
+      // Also rewrite uses in ordered regions so they read the current value
+      linearClauseProcessor.rewriteInPlace(builder, "omp.ordered.region",
+                                           index);
+      // Also rewrite uses in finalize blocks (code after ordered regions)
+      linearClauseProcessor.rewriteInPlace(builder, "omp_region.finalize",
+                                           index);
+    }
+  }
 
   // We now need to reduce the per-simd-lane reduction variable into the
   // original variable. This works a bit differently to other reductions (e.g.

diff  --git a/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir b/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
new file mode 100644
index 0000000000000..3d5addc833778
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
@@ -0,0 +1,87 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that linear variables in SIMD loops with ordered regions
+// are correctly rewritten to use .linear_result in:
+// 1. The ordered region (omp.ordered.region)
+// 2. Code after the ordered region (omp_region.finalize)
+//
+// This tests "omp ordered simd" nested in "omp simd"
+// !$omp simd
+// do i = 1, n
+//     a(i) = b(i) * 10
+//         !$omp ordered simd
+//             print *, a(i)
+//         !$omp end ordered
+//     c(i) = a(i) * 2
+// end do
+// !$omp end simd
+
+module {
+  omp.private {type = private} @i_private_i32 : i32
+
+  // CHECK-LABEL: define void @simd_ordered_linear
+  llvm.func @simd_ordered_linear() {
+    %c0_i64 = llvm.mlir.constant(0 : i64) : i64
+    %c1_i64 = llvm.mlir.constant(1 : i64) : i64
+    %c1_i32 = llvm.mlir.constant(1 : i32) : i32
+    %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+    %c10_val = llvm.mlir.constant(10 : i32) : i32
+    %c2 = llvm.mlir.constant(2 : i32) : i32
+
+    // Allocate arrays and loop variable
+    %c100_i64 = llvm.mlir.constant(100 : i64) : i64
+    %a = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %b = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %c = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %i = llvm.alloca %c1_i64 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+
+    // CHECK: %.linear_var = alloca i32
+    // CHECK: %.linear_result = alloca i32
+
+    omp.simd linear(%i = %c1_i32 : !llvm.ptr) private(@i_private_i32 %i -> %arg0 : !llvm.ptr) {
+      omp.loop_nest (%iv) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+        // CHECK: omp.loop_nest.region:
+        // CHECK: load i32, ptr %.linear_result
+        llvm.store %iv, %arg0 : i32, !llvm.ptr
+
+        // Compute a[i] = b[i] * 10
+        %i_val = llvm.load %arg0 : !llvm.ptr -> i32
+        %i_idx = llvm.sext %i_val : i32 to i64
+        %i_off = llvm.sub %i_idx, %c1_i64 : i64
+        %b_ptr = llvm.getelementptr %b[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+        %b_val = llvm.load %b_ptr : !llvm.ptr -> i32
+        %a_val = llvm.mul %b_val, %c10_val : i32
+        %a_ptr = llvm.getelementptr %a[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+        llvm.store %a_val, %a_ptr : i32, !llvm.ptr
+
+        // Ordered region
+        omp.ordered.region par_level_simd {
+          // CHECK: omp.ordered.region:
+          // CHECK: load i32, ptr %.linear_result
+          %i_ord = llvm.load %arg0 : !llvm.ptr -> i32
+          %i_ord_idx = llvm.sext %i_ord : i32 to i64
+          %i_ord_off = llvm.sub %i_ord_idx, %c1_i64 : i64
+          %a_ord_ptr = llvm.getelementptr %a[%i_ord_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+          %a_ord_val = llvm.load %a_ord_ptr : !llvm.ptr -> i32
+          omp.terminator
+        }
+
+        // Compute c[i] = a[i] * 2 (code after ordered region)
+        // CHECK: omp_region.finalize:
+        // CHECK: load i32, ptr %.linear_result
+        %i_post = llvm.load %arg0 : !llvm.ptr -> i32
+        %i_post_idx = llvm.sext %i_post : i32 to i64
+        %i_post_off = llvm.sub %i_post_idx, %c1_i64 : i64
+        %a_post_ptr = llvm.getelementptr %a[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+        %a_post_val = llvm.load %a_post_ptr : !llvm.ptr -> i32
+        %c_val = llvm.mul %a_post_val, %c2 : i32
+        %c_ptr = llvm.getelementptr %c[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+        llvm.store %c_val, %c_ptr : i32, !llvm.ptr
+
+        omp.yield
+      }
+    } {linear_var_types = [i32]}
+    llvm.return
+  }
+  // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+}

diff  --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 9a10ad74baeb6..1a1286cb30251 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -52,17 +52,6 @@ llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
 
 // -----
 
-llvm.func @ordered_region_par_level_simd() {
-  // expected-error @below {{not yet implemented: Unhandled clause parallelization-level in omp.ordered.region operation}}
-  // expected-error @below {{LLVM Translation failed for operation: omp.ordered.region}}
-  omp.ordered.region par_level_simd {
-    omp.terminator
-  }
-  llvm.return
-}
-
-// -----
-
 llvm.func @parallel_allocate(%x : !llvm.ptr) {
   // expected-error @below {{not yet implemented: Unhandled clause allocate in omp.parallel operation}}
   // expected-error @below {{LLVM Translation failed for operation: omp.parallel}}

diff  --git a/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir
new file mode 100644
index 0000000000000..d43f92ce41752
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir
@@ -0,0 +1,90 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that linear variables in worksharing+SIMD loops with ordered regions
+// are correctly rewritten to use .linear_result in:
+// 1. The ordered region (omp.ordered.region)
+// 2. Code after the ordered region (omp_region.finalize)
+//
+// This tests "omp ordered simd" nested in "omp do simd ordered" case
+// !$omp do simd ordered
+// do i = 1, n
+//     a(i) = b(i) * 10
+//         !$omp ordered simd
+//             print *, a(i)
+//         !$omp end ordered
+//     c(i) = a(i) * 2
+// end do
+// !$omp end do simd
+
+module {
+  omp.private {type = private} @i_private_i32 : i32
+
+  // CHECK-LABEL: define void @wsloop_simd_ordered_linear
+  llvm.func @wsloop_simd_ordered_linear() {
+    %c0_i64 = llvm.mlir.constant(0 : i64) : i64
+    %c1_i64 = llvm.mlir.constant(1 : i64) : i64
+    %c1_i32 = llvm.mlir.constant(1 : i32) : i32
+    %c100_i32 = llvm.mlir.constant(100 : i32) : i32
+    %c10_val = llvm.mlir.constant(10 : i32) : i32
+    %c2 = llvm.mlir.constant(2 : i32) : i32
+
+    // Allocate arrays and loop variable
+    %c100_i64 = llvm.mlir.constant(100 : i64) : i64
+    %a = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %b = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %c = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+    %i = llvm.alloca %c1_i64 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+
+    // CHECK: %.linear_var = alloca i32
+    // CHECK: %.linear_result = alloca i32
+
+    omp.wsloop ordered(0) {
+      omp.simd linear(%i = %c1_i32 : !llvm.ptr) private(@i_private_i32 %i -> %arg0 : !llvm.ptr) {
+        omp.loop_nest (%iv) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) {
+          // CHECK: omp.loop_nest.region:
+          // CHECK: load i32, ptr %.linear_result
+          llvm.store %iv, %arg0 : i32, !llvm.ptr
+
+          // Compute a[i] = b[i] * 10
+          %i_val = llvm.load %arg0 : !llvm.ptr -> i32
+          %i_idx = llvm.sext %i_val : i32 to i64
+          %i_off = llvm.sub %i_idx, %c1_i64 : i64
+          %b_ptr = llvm.getelementptr %b[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+          %b_val = llvm.load %b_ptr : !llvm.ptr -> i32
+          %a_val = llvm.mul %b_val, %c10_val : i32
+          %a_ptr = llvm.getelementptr %a[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+          llvm.store %a_val, %a_ptr : i32, !llvm.ptr
+
+          // Ordered region
+          omp.ordered.region par_level_simd {
+            // CHECK: omp.ordered.region:
+            // CHECK: load i32, ptr %.linear_result
+            %i_ord = llvm.load %arg0 : !llvm.ptr -> i32
+            %i_ord_idx = llvm.sext %i_ord : i32 to i64
+            %i_ord_off = llvm.sub %i_ord_idx, %c1_i64 : i64
+            %a_ord_ptr = llvm.getelementptr %a[%i_ord_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+            %a_ord_val = llvm.load %a_ord_ptr : !llvm.ptr -> i32
+
+            omp.terminator
+          }
+
+          // Compute c[i] = a[i] * 2 (code after ordered region)
+          // CHECK: omp_region.finalize:
+          // CHECK: load i32, ptr %.linear_result
+          %i_post = llvm.load %arg0 : !llvm.ptr -> i32
+          %i_post_idx = llvm.sext %i_post : i32 to i64
+          %i_post_off = llvm.sub %i_post_idx, %c1_i64 : i64
+          %a_post_ptr = llvm.getelementptr %a[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+          %a_post_val = llvm.load %a_post_ptr : !llvm.ptr -> i32
+          %c_val = llvm.mul %a_post_val, %c2 : i32
+          %c_ptr = llvm.getelementptr %c[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+          llvm.store %c_val, %c_ptr : i32, !llvm.ptr
+
+          omp.yield
+        }
+      } {linear_var_types = [i32], omp.composite}
+    } {omp.composite}
+    llvm.return
+  }
+  // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+}