[Mlir-commits] [mlir] 31dacdc - [flang][openmp] Add support for ordered regions in SIMD directives (#181012)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Feb 25 09:49:11 PST 2026
Author: Sunil Shrestha
Date: 2026-02-25T11:49:07-06:00
New Revision: 31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff
URL: https://github.com/llvm/llvm-project/commit/31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff
DIFF: https://github.com/llvm/llvm-project/commit/31dacdc1f5d486da6ef6d8b2f7e3b6126d92c9ff.diff
LOG: [flang][openmp] Add support for ordered regions in SIMD directives (#181012)
Add support for ordered regions within SIMD directives (!$omp simd
ordered and !$omp do simd ordered). This initial implementation matches
Clang's behavior.
In SIMD directives, loop induction variables have an implicit linear
clause with deferred store semantics (storing to .linear_result). To
properly support ordered regions, the LinearClauseProcessor rewrites
variable references to use .linear_result in:
- omp.ordered.region: Code inside ordered blocks
- omp_region.finalize: Code after ordered blocks
Note: The vectorizer cannot currently vectorize loops with ordered
regions. A future enhancement would require generating lane loops or
unrolling ordered regions across SIMD lanes while maintaining ordering
semantics.
Added:
flang/test/Lower/OpenMP/ordered-simd.f90
mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir
Modified:
flang/lib/Lower/OpenMP/ClauseProcessor.cpp
flang/lib/Lower/OpenMP/ClauseProcessor.h
flang/lib/Lower/OpenMP/OpenMP.cpp
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
mlir/test/Target/LLVMIR/openmp-todo.mlir
Removed:
################################################################################
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 77fb61274b34f..0ba4eddcfec2a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -718,6 +718,11 @@ bool ClauseProcessor::processSimdlen(
return false;
}
+bool ClauseProcessor::processSimd(
+ mlir::omp::OrderedRegionOperands &result) const {
+ return markClauseOccurrence<omp::clause::Simd>(result.parLevelSimd);
+}
+
bool ClauseProcessor::processThreadLimit(
lower::StatementContext &stmtCtx,
mlir::omp::ThreadLimitClauseOps &result) const {
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index da920407b2164..28fdc2b2dd0f0 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -116,6 +116,7 @@ class ClauseProcessor {
bool processSchedule(lower::StatementContext &stmtCtx,
mlir::omp::ScheduleClauseOps &result) const;
bool processSimdlen(mlir::omp::SimdlenClauseOps &result) const;
+ bool processSimd(mlir::omp::OrderedRegionOperands &result) const;
bool processThreadLimit(lower::StatementContext &stmtCtx,
mlir::omp::ThreadLimitClauseOps &result) const;
bool processUntied(mlir::omp::UntiedClauseOps &result) const;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 85aaf52227e25..e2018add11206 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1589,7 +1589,7 @@ genOrderedRegionClauses(lower::AbstractConverter &converter,
const List<Clause> &clauses, mlir::Location loc,
mlir::omp::OrderedRegionOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
- cp.processTODO<clause::Simd>(loc, llvm::omp::Directive::OMPD_ordered);
+ cp.processSimd(clauseOps);
}
static void genParallelClauses(
diff --git a/flang/test/Lower/OpenMP/ordered-simd.f90 b/flang/test/Lower/OpenMP/ordered-simd.f90
new file mode 100644
index 0000000000000..849900993319a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/ordered-simd.f90
@@ -0,0 +1,57 @@
+! This test checks lowering of SIMD constructs with ordered regions.
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! Test that ordered regions inside SIMD have par_level_simd attribute
+subroutine ordered_simd(n)
+ integer :: n, a(n), b(n), c(n), i
+
+! CHECK-LABEL: func @_QPordered_simd
+! CHECK: omp.simd linear({{.*}}) private({{.*}}) {
+! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK: omp.ordered.region par_level_simd {
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: } {linear_var_types = [i32]}
+
+ !$omp simd
+ do i = 1, n
+ a(i) = b(i) * 10
+ !$omp ordered simd
+ print *, a(i)
+ !$omp end ordered
+ c(i) = a(i) * 2
+ end do
+ !$omp end simd
+
+end subroutine
+
+! Test that ordered regions inside DO SIMD have par_level_simd attribute
+subroutine ws_ordered_simd(n)
+ integer :: n, a(n), b(n), c(n), i
+
+! CHECK-LABEL: func @_QPws_ordered_simd
+! CHECK: omp.wsloop ordered(0) {
+! CHECK: omp.simd linear({{.*}}) private({{.*}}) {
+! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+! CHECK: omp.ordered.region par_level_simd {
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: } {linear_var_types = [i32], omp.composite}
+! CHECK: } {omp.composite}
+
+ !$omp do simd ordered
+ do i = 1, n
+ a(i) = b(i) * 10
+ !$omp ordered simd
+ print *, a(i)
+ !$omp end ordered
+ c(i) = a(i) * 2
+ end do
+ !$omp end do simd
+
+end subroutine
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 38c5802ed60ed..571575762d54a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -354,10 +354,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getOrder() || op.getOrderMod())
result = todo("order");
};
- auto checkParLevelSimd = [&todo](auto op, LogicalResult &result) {
- if (op.getParLevelSimd())
- result = todo("parallelization-level");
- };
auto checkPrivate = [&todo](auto op, LogicalResult &result) {
if (!op.getPrivateVars().empty() || op.getPrivateSyms())
result = todo("privatization");
@@ -396,7 +392,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkAllocate(op, result);
checkOrder(op, result);
})
- .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
checkAllocate(op, result);
checkPrivate(op, result);
@@ -3515,9 +3510,26 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
order, simdlen, safelen);
linearClauseProcessor.emitStoresForLinearVar(builder);
- for (size_t index = 0; index < simdOp.getLinearVars().size(); index++)
+
+ // Check if this SIMD loop contains ordered regions
+ bool hasOrderedRegions = false;
+ simdOp.getRegion().walk([&](omp::OrderedRegionOp orderedOp) {
+ hasOrderedRegions = true;
+ return WalkResult::interrupt();
+ });
+
+ for (size_t index = 0; index < simdOp.getLinearVars().size(); index++) {
linearClauseProcessor.rewriteInPlace(builder, "omp.loop_nest.region",
index);
+ if (hasOrderedRegions) {
+ // Also rewrite uses in ordered regions so they read the current value
+ linearClauseProcessor.rewriteInPlace(builder, "omp.ordered.region",
+ index);
+ // Also rewrite uses in finalize blocks (code after ordered regions)
+ linearClauseProcessor.rewriteInPlace(builder, "omp_region.finalize",
+ index);
+ }
+ }
// We now need to reduce the per-simd-lane reduction variable into the
// original variable. This works a bit differently to other reductions (e.g.
diff --git a/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir b/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
new file mode 100644
index 0000000000000..3d5addc833778
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-ordered.mlir
@@ -0,0 +1,87 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that linear variables in SIMD loops with ordered regions
+// are correctly rewritten to use .linear_result in:
+// 1. The ordered region (omp.ordered.region)
+// 2. Code after the ordered region (omp_region.finalize)
+//
+// This tests "omp ordered simd" nested in "omp simd"
+// !$omp simd
+// do i = 1, n
+// a(i) = b(i) * 10
+// !$omp ordered simd
+// print *, a(i)
+// !$omp end ordered
+// c(i) = a(i) * 2
+// end do
+// !$omp end simd
+
+module {
+ omp.private {type = private} @i_private_i32 : i32
+
+ // CHECK-LABEL: define void @simd_ordered_linear
+ llvm.func @simd_ordered_linear() {
+ %c0_i64 = llvm.mlir.constant(0 : i64) : i64
+ %c1_i64 = llvm.mlir.constant(1 : i64) : i64
+ %c1_i32 = llvm.mlir.constant(1 : i32) : i32
+ %c10_i32 = llvm.mlir.constant(10 : i32) : i32
+ %c10_val = llvm.mlir.constant(10 : i32) : i32
+ %c2 = llvm.mlir.constant(2 : i32) : i32
+
+ // Allocate arrays and loop variable
+ %c100_i64 = llvm.mlir.constant(100 : i64) : i64
+ %a = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %b = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %c = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %i = llvm.alloca %c1_i64 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+
+ // CHECK: %.linear_var = alloca i32
+ // CHECK: %.linear_result = alloca i32
+
+ omp.simd linear(%i = %c1_i32 : !llvm.ptr) private(@i_private_i32 %i -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+ // CHECK: omp.loop_nest.region:
+ // CHECK: load i32, ptr %.linear_result
+ llvm.store %iv, %arg0 : i32, !llvm.ptr
+
+ // Compute a[i] = b[i] * 10
+ %i_val = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_idx = llvm.sext %i_val : i32 to i64
+ %i_off = llvm.sub %i_idx, %c1_i64 : i64
+ %b_ptr = llvm.getelementptr %b[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %b_val = llvm.load %b_ptr : !llvm.ptr -> i32
+ %a_val = llvm.mul %b_val, %c10_val : i32
+ %a_ptr = llvm.getelementptr %a[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %a_val, %a_ptr : i32, !llvm.ptr
+
+ // Ordered region
+ omp.ordered.region par_level_simd {
+ // CHECK: omp.ordered.region:
+ // CHECK: load i32, ptr %.linear_result
+ %i_ord = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_ord_idx = llvm.sext %i_ord : i32 to i64
+ %i_ord_off = llvm.sub %i_ord_idx, %c1_i64 : i64
+ %a_ord_ptr = llvm.getelementptr %a[%i_ord_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %a_ord_val = llvm.load %a_ord_ptr : !llvm.ptr -> i32
+ omp.terminator
+ }
+
+ // Compute c[i] = a[i] * 2 (code after ordered region)
+ // CHECK: omp_region.finalize:
+ // CHECK: load i32, ptr %.linear_result
+ %i_post = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_post_idx = llvm.sext %i_post : i32 to i64
+ %i_post_off = llvm.sub %i_post_idx, %c1_i64 : i64
+ %a_post_ptr = llvm.getelementptr %a[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %a_post_val = llvm.load %a_post_ptr : !llvm.ptr -> i32
+ %c_val = llvm.mul %a_post_val, %c2 : i32
+ %c_ptr = llvm.getelementptr %c[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %c_val, %c_ptr : i32, !llvm.ptr
+
+ omp.yield
+ }
+ } {linear_var_types = [i32]}
+ llvm.return
+ }
+ // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 9a10ad74baeb6..1a1286cb30251 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -52,17 +52,6 @@ llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
// -----
-llvm.func @ordered_region_par_level_simd() {
- // expected-error at below {{not yet implemented: Unhandled clause parallelization-level in omp.ordered.region operation}}
- // expected-error at below {{LLVM Translation failed for operation: omp.ordered.region}}
- omp.ordered.region par_level_simd {
- omp.terminator
- }
- llvm.return
-}
-
-// -----
-
llvm.func @parallel_allocate(%x : !llvm.ptr) {
// expected-error at below {{not yet implemented: Unhandled clause allocate in omp.parallel operation}}
// expected-error at below {{LLVM Translation failed for operation: omp.parallel}}
diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir
new file mode 100644
index 0000000000000..d43f92ce41752
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-simd-ordered.mlir
@@ -0,0 +1,90 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that linear variables in worksharing+SIMD loops with ordered regions
+// are correctly rewritten to use .linear_result in:
+// 1. The ordered region (omp.ordered.region)
+// 2. Code after the ordered region (omp_region.finalize)
+//
+// This tests "omp ordered simd" nested in "omp do simd ordered" case
+// !$omp do simd ordered
+// do i = 1, n
+// a(i) = b(i) * 10
+// !$omp ordered simd
+// print *, a(i)
+// !$omp end ordered
+// c(i) = a(i) * 2
+// end do
+// !$omp end do simd
+
+module {
+ omp.private {type = private} @i_private_i32 : i32
+
+ // CHECK-LABEL: define void @wsloop_simd_ordered_linear
+ llvm.func @wsloop_simd_ordered_linear() {
+ %c0_i64 = llvm.mlir.constant(0 : i64) : i64
+ %c1_i64 = llvm.mlir.constant(1 : i64) : i64
+ %c1_i32 = llvm.mlir.constant(1 : i32) : i32
+ %c100_i32 = llvm.mlir.constant(100 : i32) : i32
+ %c10_val = llvm.mlir.constant(10 : i32) : i32
+ %c2 = llvm.mlir.constant(2 : i32) : i32
+
+ // Allocate arrays and loop variable
+ %c100_i64 = llvm.mlir.constant(100 : i64) : i64
+ %a = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %b = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %c = llvm.alloca %c100_i64 x i32 : (i64) -> !llvm.ptr
+ %i = llvm.alloca %c1_i64 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+
+ // CHECK: %.linear_var = alloca i32
+ // CHECK: %.linear_result = alloca i32
+
+ omp.wsloop ordered(0) {
+ omp.simd linear(%i = %c1_i32 : !llvm.ptr) private(@i_private_i32 %i -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) {
+ // CHECK: omp.loop_nest.region:
+ // CHECK: load i32, ptr %.linear_result
+ llvm.store %iv, %arg0 : i32, !llvm.ptr
+
+ // Compute a[i] = b[i] * 10
+ %i_val = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_idx = llvm.sext %i_val : i32 to i64
+ %i_off = llvm.sub %i_idx, %c1_i64 : i64
+ %b_ptr = llvm.getelementptr %b[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %b_val = llvm.load %b_ptr : !llvm.ptr -> i32
+ %a_val = llvm.mul %b_val, %c10_val : i32
+ %a_ptr = llvm.getelementptr %a[%i_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %a_val, %a_ptr : i32, !llvm.ptr
+
+ // Ordered region
+ omp.ordered.region par_level_simd {
+ // CHECK: omp.ordered.region:
+ // CHECK: load i32, ptr %.linear_result
+ %i_ord = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_ord_idx = llvm.sext %i_ord : i32 to i64
+ %i_ord_off = llvm.sub %i_ord_idx, %c1_i64 : i64
+ %a_ord_ptr = llvm.getelementptr %a[%i_ord_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %a_ord_val = llvm.load %a_ord_ptr : !llvm.ptr -> i32
+
+ omp.terminator
+ }
+
+ // Compute c[i] = a[i] * 2 (code after ordered region)
+ // CHECK: omp_region.finalize:
+ // CHECK: load i32, ptr %.linear_result
+ %i_post = llvm.load %arg0 : !llvm.ptr -> i32
+ %i_post_idx = llvm.sext %i_post : i32 to i64
+ %i_post_off = llvm.sub %i_post_idx, %c1_i64 : i64
+ %a_post_ptr = llvm.getelementptr %a[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ %a_post_val = llvm.load %a_post_ptr : !llvm.ptr -> i32
+ %c_val = llvm.mul %a_post_val, %c2 : i32
+ %c_ptr = llvm.getelementptr %c[%i_post_off] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+ llvm.store %c_val, %c_ptr : i32, !llvm.ptr
+
+ omp.yield
+ }
+ } {linear_var_types = [i32], omp.composite}
+ } {omp.composite}
+ llvm.return
+ }
+ // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+}
More information about the Mlir-commits
mailing list