[flang-commits] [flang] [flang][OpenMP] Implement collapse for imperfectly nested loops (PR #202435)

Caroline Newcombe via flang-commits flang-commits at lists.llvm.org
Tue Jun 23 08:14:56 PDT 2026


https://github.com/cenewcombe updated https://github.com/llvm/llvm-project/pull/202435

>From 5225e3417bf4df48c4a0fa0e353d58ba17bc8b3a Mon Sep 17 00:00:00 2001
From: Caroline Newcombe <caroline.newcombe at hpe.com>
Date: Mon, 1 Jun 2026 10:31:10 -0500
Subject: [PATCH 1/3] [flang][OpenMP] Implement collapse for imperfectly nested
 loops

---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 165 ++++-
 flang/lib/Lower/OpenMP/Utils.cpp              |   3 +-
 flang/lib/Semantics/check-omp-loop.cpp        |  50 ++
 flang/lib/Semantics/openmp-utils.cpp          |  17 +-
 .../Lower/OpenMP/collapse-imperfect-nest.f90  | 604 ++++++++++++++++++
 flang/test/Semantics/OpenMP/do-collapse.f90   |   8 +-
 .../OpenMP/do-concurrent-collapse-60.f90      |   4 +-
 .../OpenMP/do-concurrent-collapse.f90         |   4 +-
 flang/test/Semantics/OpenMP/do08.f90          |  15 -
 flang/test/Semantics/OpenMP/do10.f90          |   2 +-
 flang/test/Semantics/OpenMP/do13.f90          |  10 +-
 flang/test/Semantics/OpenMP/do15.f90          |   9 -
 flang/test/Semantics/OpenMP/do16.f90          |   6 -
 flang/test/Semantics/OpenMP/do22.f90          | 127 +++-
 .../OpenMP/doacross-nesting-omp52.f90         | 110 ++++
 15 files changed, 1076 insertions(+), 58 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/collapse-imperfect-nest.f90
 create mode 100644 flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index ac75277e001da..f9eb59cbe0b95 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -36,6 +36,7 @@
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Parser/characters.h"
@@ -678,6 +679,145 @@ static void genNestedEvaluations(lower::AbstractConverter &converter,
     converter.genEval(e);
 }
 
+/// Generate the body of a collapsed loop nest. When the associated loops are
+/// imperfectly nested, the intervening code found at each intermediate level
+/// is emitted as well (CLN relaxation, applied retroactively for all OMP
+/// versions).
+///
+/// omp.loop_nest places its entire body at the innermost nesting level, so
+/// each piece of intervening code is wrapped in a fir.if that is taken only
+/// when every inner induction variable holds its initial value (code that
+/// precedes the nested loop) or its final value (code that follows it).
+///
+/// \param [in] converter - PFT to MLIR conversion interface.
+/// \param [in] outerEval - the evaluation containing the outermost loop
+///                         (typically the OpenMP construct evaluation).
+/// \param [in] collapseValue - number of loops being collapsed (>= 1).
+static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
+                                     lower::pft::Evaluation &outerEval,
+                                     int collapseValue) {
+  assert(collapseValue >= 1);
+  if (collapseValue == 1) {
+    genNestedEvaluations(converter, outerEval, /*collapseValue=*/1);
+    return;
+  }
+
+  fir::FirOpBuilder &firOpBuilder{converter.getFirOpBuilder()};
+  const mlir::Location loc{converter.getCurrentLocation()};
+
+  // The op owning the insertion block must be the omp.loop_nest (IVs, bounds).
+  auto loopNestOp{mlir::dyn_cast<mlir::omp::LoopNestOp>(
+      firOpBuilder.getInsertionBlock()->getParentOp())};
+  assert(loopNestOp && "expected to be inside omp.loop_nest");
+
+  // Intervening evaluations before/after the nested DO, one entry per level.
+  struct LevelInfo {
+    llvm::SmallVector<lower::pft::Evaluation *> before;
+    llvm::SmallVector<lower::pft::Evaluation *> after;
+  };
+  llvm::SmallVector<LevelInfo> levels;
+
+  lower::pft::Evaluation *curEval{&outerEval};
+  for (int i{0}; i < collapseValue - 1; ++i) {
+    lower::pft::Evaluation *const doEval{getNestedDoConstruct(*curEval)};
+    LevelInfo level;
+    bool pastDo{false};
+    for (lower::pft::Evaluation &e : doEval->getNestedEvaluations()) {
+      if (e.getIf<parser::NonLabelDoStmt>() || e.getIf<parser::EndDoStmt>())
+        continue;
+      // Semantics guarantees the only DoConstruct at this level is the next
+      // associated loop (other DO loops are rejected as intervening code).
+      if (e.getIf<parser::DoConstruct>()) {
+        pastDo = true;
+        continue;
+      }
+      if (!pastDo)
+        level.before.push_back(&e);
+      else
+        level.after.push_back(&e);
+    }
+    levels.push_back(std::move(level));
+    curEval = doEval;
+  }
+
+  // Build one guard: the conjunction, over loop levels
+  // startLevel..endLevel-1, of the comparison "iv == target".
+  // "Before" guards (useLowerBound=true) use target = lb (first iteration).
+  // "After" guards (useLowerBound=false) use target = last_iv, where
+  // last_iv = lb + ((ub - lb) / step) * step; with a non-unit step the IV
+  // may never be exactly equal to the upper bound.
+  auto buildGuard = [&](const int startLevel, const int endLevel,
+                        const bool useLowerBound) -> mlir::Value {
+    mlir::Value cond{};
+    const auto lbs{loopNestOp.getLoopLowerBounds()};
+    const auto ubs{loopNestOp.getLoopUpperBounds()};
+    const auto steps{loopNestOp.getLoopSteps()};
+    for (int lvl{startLevel}; lvl < endLevel; ++lvl) {
+      const mlir::Value iv{loopNestOp.getRegion().getArgument(lvl)};
+      mlir::Value target;
+      if (useLowerBound) {
+        target = lbs[lvl];
+      } else {
+        // A constant step of +1 or -1 lets the guard compare against ub.
+        const auto constStep{fir::getIntIfConstant(steps[lvl])};
+        if (constStep && (*constStep == 1 || *constStep == -1)) {
+          target = ubs[lvl];
+        } else {
+          // Otherwise emit last_iv = lb + ((ub - lb) / step) * step (signed).
+          const mlir::Value lb{lbs[lvl]};
+          const mlir::Value ub{ubs[lvl]};
+          const mlir::Value step{steps[lvl]};
+          const mlir::Value range{
+              mlir::arith::SubIOp::create(firOpBuilder, loc, ub, lb)};
+          const mlir::Value tripMinus1{
+              mlir::arith::DivSIOp::create(firOpBuilder, loc, range, step)};
+          const mlir::Value lastOffset{
+              mlir::arith::MulIOp::create(firOpBuilder, loc, tripMinus1, step)};
+          target =
+              mlir::arith::AddIOp::create(firOpBuilder, loc, lb, lastOffset);
+        }
+      }
+      const mlir::Value cmp = mlir::arith::CmpIOp::create(
+          firOpBuilder, loc, mlir::arith::CmpIPredicate::eq, iv, target);
+      if (!cond)
+        cond = cmp;
+      else
+        cond = mlir::arith::AndIOp::create(firOpBuilder, loc, cond, cmp);
+    }
+    return cond;
+  };
+
+  // "Before" code, outermost level first; guard: inner IVs == lower bounds.
+  for (int i{0}; i < static_cast<int>(levels.size()); ++i) {
+    if (levels[i].before.empty())
+      continue;
+    const mlir::Value guard{
+        buildGuard(i + 1, collapseValue, /*useLowerBound=*/true)};
+    auto ifOp{fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false)};
+    firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    for (auto *e : levels[i].before)
+      converter.genEval(*e);
+    firOpBuilder.setInsertionPointAfter(ifOp);
+  }
+
+  // Body of the innermost collapsed loop.
+  genNestedEvaluations(converter, *curEval, /*collapseValue=*/1);
+
+  // "After" code, innermost level first; guard: inner IVs == the value they
+  // take on their last iteration (buildGuard handles non-unit steps).
+  for (int i{static_cast<int>(levels.size()) - 1}; i >= 0; --i) {
+    if (levels[i].after.empty())
+      continue;
+    const mlir::Value guard{
+        buildGuard(i + 1, collapseValue, /*useLowerBound=*/false)};
+    auto ifOp{fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false)};
+    firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    for (auto *e : levels[i].after)
+      converter.genEval(*e);
+    firOpBuilder.setInsertionPointAfter(ifOp);
+  }
+}
+
 static fir::GlobalOp globalInitialization(lower::AbstractConverter &converter,
                                           fir::FirOpBuilder &firOpBuilder,
                                           const semantics::Symbol &sym,
@@ -1233,6 +1373,13 @@ struct OpWithBodyGenInfo {
     return *this;
   }
 
+  OpWithBodyGenInfo &setCollapseInfo(int value,
+                                     lower::pft::Evaluation &outerEval) {
+    collapseValue = value; // collapse depth
+    outerCollapseEval = &outerEval; // non-null => collapsed-body lowering
+    return *this; // returns self so setter calls can be chained
+  }
+
   /// [inout] converter to use for the clauses.
   lower::AbstractConverter &converter;
   /// [in] Symbol table
@@ -1261,6 +1408,10 @@ struct OpWithBodyGenInfo {
   bool genSkeletonOnly = false;
   /// [in] enables handling of privatized variable unless set to `false`.
   bool privatize = true;
+  /// [in] if set, outermost evaluation and collapse depth for emitting
+  /// intervening code from imperfect collapsed loop nests.
+  lower::pft::Evaluation *outerCollapseEval{nullptr};
+  int collapseValue{0};
 };
 
 /// Create the body (block) for an OpenMP Operation.
@@ -1355,7 +1506,11 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info,
       firOpBuilder.setInsertionPointToEnd(&op.getRegion(0).back());
       auto *temp = lower::genOpenMPTerminator(firOpBuilder, &op, info.loc);
       firOpBuilder.setInsertionPointAfter(marker);
-      genNestedEvaluations(info.converter, info.eval);
+      if (info.outerCollapseEval)
+        genCollapsedLoopNestBody(info.converter, *info.outerCollapseEval,
+                                 info.collapseValue);
+      else
+        genNestedEvaluations(info.converter, info.eval);
       temp->erase();
     }
   }
@@ -2192,7 +2347,8 @@ genLoopNestOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                         directive)
           .setClauses(&item->clauses)
           .setDataSharingProcessor(&dsp)
-          .setGenRegionEntryCb(ivCallback),
+          .setGenRegionEntryCb(ivCallback)
+          .setCollapseInfo(nestValue, eval),
       queue, item, clauseOps);
 }
 
@@ -2282,7 +2438,7 @@ static void genCanonicalLoopNest(
   // Step 1: Loop prologues
   // Computing the trip count must happen before entering the outermost loop
   lower::pft::Evaluation *innermostEval = nestedEval;
-  for ([[maybe_unused]] auto iv : ivs) {
+  for (std::size_t i{0}; i < ivs.size(); ++i) {
     if (innermostEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
       // OpenMP specifies DO CONCURRENT only with the `!omp loop` construct.
       // Will need to add special cases for this combination.
@@ -2364,7 +2520,8 @@ static void genCanonicalLoopNest(
     mlir::Value cli = newcli.getResult();
     clis.push_back(cli);
 
-    innermostEval = &*std::next(innermostEval->getNestedEvaluations().begin());
+    if (i + 1 < ivs.size())
+      innermostEval = getNestedDoConstruct(*innermostEval);
   }
 
   // Step 2: Create nested canoncial loops
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 0ef824df8455b..d75e3fb2608bb 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -717,7 +717,8 @@ pft::Evaluation *getNestedDoConstruct(pft::Evaluation &eval) {
     // constructs between the directive and the actual do-loop nest.
     if (nested.getIf<parser::OpenMPConstruct>())
       return getNestedDoConstruct(nested);
-    assert(false && "Unexpected construct in the nested evaluations");
+    // Skip valid intervening code in imperfect loop nests
+    continue;
   }
   llvm_unreachable("Expected do loop to be in the nested evaluations");
 }
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index 6c816d71b35a8..a10d6e4fc93e1 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -111,6 +111,49 @@ class AssociatedLoopChecker {
   std::int64_t level_;
   std::map<std::string, std::int64_t> constructNamesAndLevels_;
 };
+
+/// Parse-tree visitor that sets `found` once it sees an ordered directive
+/// carrying a doacross clause, or a depend clause holding an OmpDoacross
+/// (the pre-5.2 depend(sink/source) equivalent). It does not descend into
+/// nested OpenMP block or loop constructs, since doacross directives inside
+/// them bind to an inner worksharing-loop region, not the one being checked.
+struct DoacrossFinder {
+  bool found{false};
+  template <typename T> bool Pre(const T &) { return !found; } // stop early
+  template <typename T> void Post(const T &) {}
+
+  // Returning false stops descent into nested OpenMP regions (new bindings).
+  bool Pre(const parser::OmpBlockConstruct &) { return false; }
+  bool Pre(const parser::OpenMPLoopConstruct &) { return false; }
+
+  void Post(const parser::OpenMPSimpleStandaloneConstruct &x) {
+    if (found) {
+      return;
+    }
+    if (x.v.DirId() != llvm::omp::Directive::OMPD_ordered) {
+      return;
+    }
+    for (const auto &clause : x.v.Clauses().v) {
+      if (std::holds_alternative<parser::OmpClause::Doacross>(clause.u)) {
+        found = true;
+        return;
+      }
+      if (const auto *depend{
+              std::get_if<parser::OmpClause::Depend>(&clause.u)}) {
+        if (std::holds_alternative<parser::OmpDoacross>(depend->v.u)) {
+          found = true;
+          return;
+        }
+      }
+    }
+  }
+};
+
+bool ContainsDoacrossDirective(const parser::Block &block) {
+  DoacrossFinder finder;
+  parser::Walk(block, finder); // the visitor's Post sets finder.found
+  return finder.found;
+}
 } // namespace
 
 namespace Fortran::semantics {
@@ -326,6 +369,13 @@ void OmpStructureChecker::CheckNestedConstruct(
     // Check requirements on nest depth.
     auto [needDepth, needPerfect]{
         GetAffectedNestDepthWithReason(beginSpec, version)};
+
+    // In OpenMP 5.2+, perfect nesting is only required for doacross loop
+    // nests (those whose body contains ordered doacross directives).
+    if (!needPerfect && version >= 52 && ContainsDoacrossDirective(body)) {
+      needPerfect = true;
+    }
+
     auto &[haveSema, havePerf]{sequence.depth()};
 
     auto haveDepth{needPerfect ? havePerf : haveSema};
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 51dd08d0924b1..edff3cb3e9cfd 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -990,10 +990,19 @@ std::pair<WithReason<int64_t>, bool> GetAffectedNestDepthWithReason(
       oreason = Reason();
     }
     if (ccount < ocount) {
-      // `ocount` cannot be std::nullopt here (C++ std guarantee).
-      return {{ocount.value_or(1), std::move(oreason)}, true};
-    }
-    return {{ccount.value_or(1), std::move(creason)}, true};
+      // Prior to 5.2, ordered(n) requires perfect nesting unconditionally.
+      // In 5.2+, perfect nesting is required only for doacross (checked later
+      // with ContainsDoacrossDirective).
+      return {{ocount.value_or(1), std::move(oreason)}, version < 52};
+    }
+    // Prior to 5.2, ordered(n) requires perfect nesting.
+    // In 5.2+, only doacross nests require it (checked separately).
+    // CLN relaxation for collapse is applied retroactively for all versions.
+    bool needPerfect{false};
+    if (version < 52) {
+      needPerfect = ocount.has_value();
+    }
+    return {{ccount.value_or(1), std::move(creason)}, needPerfect};
   }
 
   if (IsLoopTransforming(dir)) {
diff --git a/flang/test/Lower/OpenMP/collapse-imperfect-nest.f90 b/flang/test/Lower/OpenMP/collapse-imperfect-nest.f90
new file mode 100644
index 0000000000000..9fe0582ce66cd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/collapse-imperfect-nest.f90
@@ -0,0 +1,604 @@
+! Test lowering of imperfectly nested collapse loops (CLN relaxation).
+! Intervening code is guarded by IV comparisons to restore correct
+! execution frequency and ordering within the flat omp.loop_nest body.
+
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPcollapse2_imperfect
+subroutine collapse2_imperfect(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    x = x + 1
+    do j = 1, n
+      x = x + j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{.*}}, %[[LB_J:.*]]) to
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Guard: j == lower_bound (before code executes once per i)
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LB_J]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! Intervening code: x = x + 1
+! CHECK:             %[[X1:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[C1:.*]] = arith.constant 1 : i32
+! CHECK:             %[[ADD1:.*]] = arith.addi %[[X1]], %[[C1]] : i32
+! CHECK:             hlfir.assign %[[ADD1]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           }
+! Innermost body: x = x + j
+! CHECK:           %[[X2:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[JVAL:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[ADD2:.*]] = arith.addi %[[X2]], %[[JVAL]] : i32
+! CHECK:           hlfir.assign %[[ADD2]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           omp.yield
+
+! CHECK-LABEL: func.func @_QPcollapse3_imperfect
+subroutine collapse3_imperfect(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j, k
+
+  !$omp do collapse(3)
+  do i = 1, n
+    x = x + i
+    do j = 1, n
+      x = x + j
+      do k = 1, n
+        x = x + k
+      end do
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I3:.*]], %[[J3:.*]], %[[K3:.*]]) : i32 =
+! CHECK-SAME:      (%{{.*}}, %[[LB_J3:.*]], %[[LB_K3:.*]]) to
+! CHECK:           hlfir.assign %[[I3]]
+! CHECK:           hlfir.assign %[[J3]]
+! CHECK:           hlfir.assign %[[K3]]
+! Guard: j == lb_j AND k == lb_k (level 0 before code, once per i)
+! CHECK:           %[[CMP_J:.*]] = arith.cmpi eq, %[[J3]], %[[LB_J3]] : i32
+! CHECK:           %[[CMP_K1:.*]] = arith.cmpi eq, %[[K3]], %[[LB_K3]] : i32
+! CHECK:           %[[AND1:.*]] = arith.andi %[[CMP_J]], %[[CMP_K1]] : i1
+! CHECK:           fir.if %[[AND1]] {
+! Intervening code at level 0: x = x + i
+! CHECK:             %[[XI:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[IVAL:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[ADDI:.*]] = arith.addi %[[XI]], %[[IVAL]] : i32
+! CHECK:             hlfir.assign %[[ADDI]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           }
+! Guard: k == lb_k (level 1 before code, once per (i,j))
+! CHECK:           %[[CMP_K2:.*]] = arith.cmpi eq, %[[K3]], %[[LB_K3]] : i32
+! CHECK:           fir.if %[[CMP_K2]] {
+! Intervening code at level 1: x = x + j
+! CHECK:             %[[XJ:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[JVAL3:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[ADDJ:.*]] = arith.addi %[[XJ]], %[[JVAL3]] : i32
+! CHECK:             hlfir.assign %[[ADDJ]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           }
+! Innermost body: x = x + k
+! CHECK:           %[[XK:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[KVAL:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[ADDK:.*]] = arith.addi %[[XK]], %[[KVAL]] : i32
+! CHECK:           hlfir.assign %[[ADDK]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           omp.yield
+
+! CHECK-LABEL: func.func @_QPcollapse2_both_sides
+subroutine collapse2_both_sides(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp simd collapse(2)
+  do i = 1, n
+    x = x + 1
+    do j = 1, n
+      x = x + j
+    end do
+    call ext_sub(x)
+  end do
+  !$omp end simd
+end subroutine
+
+! CHECK:       omp.simd
+! CHECK-NEXT:    omp.loop_nest (%[[I4:.*]], %[[J4:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J4:[^)]*]]) to (%{{[^)]*}}, %[[UB_J4:[^)]*]])
+! CHECK:           hlfir.assign %[[I4]]
+! CHECK:           hlfir.assign %[[J4]]
+! Guard: j == lower_bound (before code)
+! CHECK:           %[[CMP_B:.*]] = arith.cmpi eq, %[[J4]], %[[LB_J4]] : i32
+! CHECK:           fir.if %[[CMP_B]] {
+! Intervening code before inner loop: x = x + 1
+! CHECK:             %[[XB:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:             %[[CB:.*]] = arith.constant 1 : i32
+! CHECK:             %[[ADDB:.*]] = arith.addi %[[XB]], %[[CB]] : i32
+! CHECK:             hlfir.assign %[[ADDB]] to %{{.*}} : i32, !fir.ref<i32>
+! CHECK:           }
+! Innermost body: x = x + j
+! CHECK:           %[[XIN:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[JIN:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK:           %[[ADDIN:.*]] = arith.addi %[[XIN]], %[[JIN]] : i32
+! CHECK:           hlfir.assign %[[ADDIN]] to %{{.*}} : i32, !fir.ref<i32>
+! Guard: j == upper_bound (after code)
+! CHECK:           %[[CMP_A:.*]] = arith.cmpi eq, %[[J4]], %[[UB_J4]] : i32
+! CHECK:           fir.if %[[CMP_A]] {
+! Intervening code after inner loop: call ext_sub(x)
+! CHECK:             fir.call @_QPext_sub
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(3) with both before and after code at multiple levels.
+! CHECK-LABEL: func.func @_QPcollapse3_both_sides
+subroutine collapse3_both_sides(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j, k
+
+  !$omp do collapse(3)
+  do i = 1, n
+    x = x + i
+    do j = 1, n
+      x = x + j
+      do k = 1, n
+        x = x + k
+      end do
+      x = x - j
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]], %[[K:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]], %[[LB_K:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]], %[[UB_K:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! CHECK:           hlfir.assign %[[K]]
+!
+! --- "before" guards (outermost level first) ---
+!
+! Guard level 0 before: j == lb_j AND k == lb_k (once per i)
+! CHECK:           %[[CJ1:.*]] = arith.cmpi eq, %[[J]], %[[LB_J]] : i32
+! CHECK:           %[[CK1:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           %[[AND1:.*]] = arith.andi %[[CJ1]], %[[CK1]] : i1
+! CHECK:           fir.if %[[AND1]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Guard level 1 before: k == lb_k (once per (i,j))
+! CHECK:           %[[CK2:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           fir.if %[[CK2]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+!
+! --- innermost body: x = x + k ---
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+!
+! --- "after" guards (innermost level first) ---
+!
+! Guard level 1 after: k == ub_k (once per (i,j))
+! CHECK:           %[[CK3:.*]] = arith.cmpi eq, %[[K]], %[[UB_K]] : i32
+! CHECK:           fir.if %[[CK3]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Guard level 0 after: j == ub_j AND k == ub_k (once per i)
+! CHECK:           %[[CJ2:.*]] = arith.cmpi eq, %[[J]], %[[UB_J]] : i32
+! CHECK:           %[[CK4:.*]] = arith.cmpi eq, %[[K]], %[[UB_K]] : i32
+! CHECK:           %[[AND2:.*]] = arith.andi %[[CJ2]], %[[CK4]] : i1
+! CHECK:           fir.if %[[AND2]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(4) with imperfect nesting at some levels and perfectly nested
+! innermost loops. Level 0 (i->j) has before+after, level 1 (j->k) has before
+! only, level 2 (k->l) is perfectly nested. This exercises skipping empty levels.
+! CHECK-LABEL: func.func @_QPcollapse4_mixed
+subroutine collapse4_mixed(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j, k, l
+
+  !$omp do collapse(4)
+  do i = 1, n
+    x = x + i
+    do j = 1, n
+      x = x + j
+      do k = 1, n
+        do l = 1, n
+          x = x + l
+        end do
+      end do
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]], %[[K:.*]], %[[L:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]], %[[LB_K:[^)]*]], %[[LB_L:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]], %[[UB_K:[^)]*]], %[[UB_L:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! CHECK:           hlfir.assign %[[K]]
+! CHECK:           hlfir.assign %[[L]]
+!
+! --- "before" guards ---
+!
+! Guard level 0 before: j == lb_j AND k == lb_k AND l == lb_l (once per i)
+! CHECK:           %[[CJ1:.*]] = arith.cmpi eq, %[[J]], %[[LB_J]] : i32
+! CHECK:           %[[CK1:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           %[[A1:.*]] = arith.andi %[[CJ1]], %[[CK1]] : i1
+! CHECK:           %[[CL1:.*]] = arith.cmpi eq, %[[L]], %[[LB_L]] : i32
+! CHECK:           %[[A2:.*]] = arith.andi %[[A1]], %[[CL1]] : i1
+! CHECK:           fir.if %[[A2]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Guard level 1 before: k == lb_k AND l == lb_l (once per (i,j))
+! CHECK:           %[[CK2:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           %[[CL2:.*]] = arith.cmpi eq, %[[L]], %[[LB_L]] : i32
+! CHECK:           %[[A3:.*]] = arith.andi %[[CK2]], %[[CL2]] : i1
+! CHECK:           fir.if %[[A3]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Level 2 (k->l) is perfectly nested: no guard emitted.
+!
+! --- innermost body: x = x + l ---
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+!
+! --- "after" guards (innermost first) ---
+!
+! Level 2 after: empty (perfectly nested), no guard emitted.
+! Level 1 after: empty, no guard emitted.
+! Guard level 0 after: j == ub_j AND k == ub_k AND l == ub_l (once per i)
+! CHECK:           %[[CJ2:.*]] = arith.cmpi eq, %[[J]], %[[UB_J]] : i32
+! CHECK:           %[[CK3:.*]] = arith.cmpi eq, %[[K]], %[[UB_K]] : i32
+! CHECK:           %[[A4:.*]] = arith.andi %[[CJ2]], %[[CK3]] : i1
+! CHECK:           %[[CL3:.*]] = arith.cmpi eq, %[[L]], %[[UB_L]] : i32
+! CHECK:           %[[A5:.*]] = arith.andi %[[A4]], %[[CL3]] : i1
+! CHECK:           fir.if %[[A5]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(2) with only after-code (no before-code). Exercises the path
+! where levels[i].before.empty() is true, so no "before" guard is emitted.
+! CHECK-LABEL: func.func @_QPcollapse2_after_only
+subroutine collapse2_after_only(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    do j = 1, n
+      x = x + j
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %{{[^)]*}}) to (%{{[^)]*}}, %[[UB_J:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! No "before" guard emitted (level 0 before is empty).
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! Guard: j == upper_bound (after code)
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[UB_J]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(2) with multiple statements inside a single guard. Verifies
+! that all evals in level.before land inside the same fir.if block.
+! CHECK-LABEL: func.func @_QPcollapse2_multi_stmt
+subroutine collapse2_multi_stmt(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    x = x + 1
+    x = x + i
+    do j = 1, n
+      x = x + j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]]) to
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Guard: j == lower_bound (before code, multiple statements in one guard)
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LB_J]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! First intervening statement: x = x + 1
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! Second intervening statement: x = x + i
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! CHECK:           omp.yield
+
+! Test collapse(2) with non-unit lower bound on inner loop. Verifies the guard
+! compares against the actual loop lower bound operand (3, not 1).
+! CHECK-LABEL: func.func @_QPcollapse2_nonunit_lb
+subroutine collapse2_nonunit_lb(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    x = x + i
+    do j = 3, n
+      x = x + j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]]) to
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Guard: j == lb_j (lb_j is 3, not 1)
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LB_J]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! CHECK:           omp.yield
+
+! Test collapse(3) with after-only at level 0 and before-only at level 1.
+! Exercises the independent skip logic at each level in both emission loops.
+! CHECK-LABEL: func.func @_QPcollapse3_mixed_sides
+subroutine collapse3_mixed_sides(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j, k
+
+  !$omp do collapse(3)
+  do i = 1, n
+    do j = 1, n
+      x = x + j
+      do k = 1, n
+        x = x + k
+      end do
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]], %[[K:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %{{[^)]*}}, %[[LB_K:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]], %[[UB_K:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! CHECK:           hlfir.assign %[[K]]
+! Level 0 before: empty (skipped).
+! Guard level 1 before: k == lb_k (once per (i,j))
+! CHECK:           %[[CK:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           fir.if %[[CK]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Innermost body: x = x + k
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! Level 1 after: empty (skipped).
+! Guard level 0 after: j == ub_j AND k == ub_k (once per i)
+! CHECK:           %[[CJ:.*]] = arith.cmpi eq, %[[J]], %[[UB_J]] : i32
+! CHECK:           %[[CK2:.*]] = arith.cmpi eq, %[[K]], %[[UB_K]] : i32
+! CHECK:           %[[AND:.*]] = arith.andi %[[CJ]], %[[CK2]] : i1
+! CHECK:           fir.if %[[AND]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(2) with non-unit positive step and after-code.
+! The after guard must compare iv against the last executed value
+! (lb + ((ub - lb) / step) * step), not the upper bound directly.
+! For do j = 1, 10, 4: last_iv = 1 + ((10-1)/4)*4 = 1 + 2*4 = 9 (integer division: 9/4 = 2).
+! CHECK-LABEL: func.func @_QPcollapse2_nonunit_step_after
+subroutine collapse2_nonunit_step_after(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    do j = 1, 10, 4
+      x = x + j
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]]) inclusive step (%{{[^)]*}}, %[[STEP_J:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! After guard: compute last_iv = lb + ((ub - lb) / step) * step
+! CHECK:           %[[RANGE:.*]] = arith.subi %[[UB_J]], %[[LB_J]] : i32
+! CHECK:           %[[DIV:.*]] = arith.divsi %[[RANGE]], %[[STEP_J]] : i32
+! CHECK:           %[[MUL:.*]] = arith.muli %[[DIV]], %[[STEP_J]] : i32
+! CHECK:           %[[LAST:.*]] = arith.addi %[[LB_J]], %[[MUL]] : i32
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LAST]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(2) with negative step and after-code.
+! For do j = 10, 1, -4: last_iv = 10 + ((1-10)/(-4))*(-4) = 10 + 2*(-4) = 2 (division truncates: -9/-4 = 2).
+! CHECK-LABEL: func.func @_QPcollapse2_negative_step_after
+subroutine collapse2_negative_step_after(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    do j = 10, 1, -4
+      x = x + j
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]]) inclusive step (%{{[^)]*}}, %[[STEP_J:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! After guard: compute last_iv for negative step
+! CHECK:           %[[RANGE:.*]] = arith.subi %[[UB_J]], %[[LB_J]] : i32
+! CHECK:           %[[DIV:.*]] = arith.divsi %[[RANGE]], %[[STEP_J]] : i32
+! CHECK:           %[[MUL:.*]] = arith.muli %[[DIV]], %[[STEP_J]] : i32
+! CHECK:           %[[LAST:.*]] = arith.addi %[[LB_J]], %[[MUL]] : i32
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LAST]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(3) with non-unit step on the middle loop (not innermost).
+! For do j = 1, n, 3: last_iv = lb + ((ub - lb) / step) * step (runtime).
+! CHECK-LABEL: func.func @_QPcollapse3_nonunit_step_middle
+subroutine collapse3_nonunit_step_middle(n, x)
+  integer, intent(in) :: n
+  integer, intent(inout) :: x
+  integer :: i, j, k
+
+  !$omp do collapse(3)
+  do i = 1, n
+    do j = 1, n, 3
+      x = x + j
+      do k = 1, n
+        x = x + k
+      end do
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]], %[[K:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]], %[[LB_K:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]], %[[UB_K:[^)]*]]) inclusive step (%{{[^)]*}}, %[[STEP_J:[^)]*]], %{{[^)]*}})
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! CHECK:           hlfir.assign %[[K]]
+! Guard level 1 before: k == lb_k (once per (i,j))
+! CHECK:           %[[CK1:.*]] = arith.cmpi eq, %[[K]], %[[LB_K]] : i32
+! CHECK:           fir.if %[[CK1]] {
+! CHECK:             arith.addi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! Innermost body: x = x + k
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! Guard level 0 after: must compute last_iv for j (non-unit step) AND k == ub_k
+! CHECK:           %[[RANGE:.*]] = arith.subi %[[UB_J]], %[[LB_J]] : i32
+! CHECK:           %[[DIV:.*]] = arith.divsi %[[RANGE]], %[[STEP_J]] : i32
+! CHECK:           %[[MUL:.*]] = arith.muli %[[DIV]], %[[STEP_J]] : i32
+! CHECK:           %[[LASTJ:.*]] = arith.addi %[[LB_J]], %[[MUL]] : i32
+! CHECK:           %[[CJ:.*]] = arith.cmpi eq, %[[J]], %[[LASTJ]] : i32
+! CHECK:           %[[CK2:.*]] = arith.cmpi eq, %[[K]], %[[UB_K]] : i32
+! CHECK:           %[[AND:.*]] = arith.andi %[[CJ]], %[[CK2]] : i1
+! CHECK:           fir.if %[[AND]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
+
+! Test collapse(2) with a dynamic (runtime) step value.
+! The step is not a compile-time constant, so the last_iv computation
+! cannot be folded away and must remain as arith ops in the IR.
+! CHECK-LABEL: func.func @_QPcollapse2_dynamic_step_after
+subroutine collapse2_dynamic_step_after(n, s, x)
+  integer, intent(in) :: n, s
+  integer, intent(inout) :: x
+  integer :: i, j
+
+  !$omp do collapse(2)
+  do i = 1, n
+    do j = 1, n, s
+      x = x + j
+    end do
+    x = x - i
+  end do
+  !$omp end do
+end subroutine
+
+! CHECK:       omp.wsloop
+! CHECK-NEXT:    omp.loop_nest (%[[I:.*]], %[[J:.*]]) : i32 =
+! CHECK-SAME:      (%{{[^)]*}}, %[[LB_J:[^)]*]]) to (%{{[^)]*}}, %[[UB_J:[^)]*]]) inclusive step (%{{[^)]*}}, %[[STEP_J:[^)]*]])
+! CHECK:           hlfir.assign %[[I]]
+! CHECK:           hlfir.assign %[[J]]
+! Innermost body: x = x + j
+! CHECK:           arith.addi
+! CHECK:           hlfir.assign
+! After guard: dynamic step forces last_iv computation to stay in IR
+! CHECK:           %[[RANGE:.*]] = arith.subi %[[UB_J]], %[[LB_J]] : i32
+! CHECK:           %[[DIV:.*]] = arith.divsi %[[RANGE]], %[[STEP_J]] : i32
+! CHECK:           %[[MUL:.*]] = arith.muli %[[DIV]], %[[STEP_J]] : i32
+! CHECK:           %[[LAST:.*]] = arith.addi %[[LB_J]], %[[MUL]] : i32
+! CHECK:           %[[CMP:.*]] = arith.cmpi eq, %[[J]], %[[LAST]] : i32
+! CHECK:           fir.if %[[CMP]] {
+! CHECK:             arith.subi
+! CHECK:             hlfir.assign
+! CHECK:           }
+! CHECK:           omp.yield
diff --git a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90
index 86354d6a61a31..29839a7e0b93d 100644
--- a/flang/test/Semantics/OpenMP/do-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-collapse.f90
@@ -3,7 +3,7 @@
 ! 2.7.1 Collapse Clause
 program omp_doCollapse
   integer:: i,j
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+  !ERROR: This construct requires a nest of depth 3, but the associated nest is a nest of depth 2
   !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do collapse(3)
   do i = 1,10
@@ -15,7 +15,7 @@ program omp_doCollapse
 
   do i = 1,10
     do j = 1, 10
-      !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+      !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
       !BECAUSE: COLLAPSE clause was specified with argument 2
       !$omp do collapse(2)
       do k = 1, 10
@@ -25,7 +25,7 @@ program omp_doCollapse
     end do
   end do
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp parallel do collapse(2)
     do i = 1, 3
@@ -35,7 +35,7 @@ program omp_doCollapse
       end do
     end do
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !ERROR: At most one COLLAPSE clause can appear on the SIMD directive
   !$omp simd collapse(2) collapse(1)
diff --git a/flang/test/Semantics/OpenMP/do-concurrent-collapse-60.f90 b/flang/test/Semantics/OpenMP/do-concurrent-collapse-60.f90
index 4ba8e71a26323..75351cdf7190c 100644
--- a/flang/test/Semantics/OpenMP/do-concurrent-collapse-60.f90
+++ b/flang/test/Semantics/OpenMP/do-concurrent-collapse-60.f90
@@ -3,7 +3,7 @@
 subroutine f
   integer :: i
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp parallel do collapse(2)
   do i = 1, 1
@@ -42,7 +42,7 @@ subroutine f
     print *, j
   end do
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp loop collapse(2)
   do i = 1, 1
diff --git a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90 b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
index 7cd10518d845a..0a6906b0e2e63 100644
--- a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
@@ -1,7 +1,7 @@
 !RUN: %python %S/../test_errors.py %s %flang -fopenmp
 
 integer :: i, j
-! ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+! ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
 ! BECAUSE: COLLAPSE clause was specified with argument 2
 !$omp parallel do collapse(2)
 do i = 1, 1
@@ -33,7 +33,7 @@
   print *, j
 end do
 
-! ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+! ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
 ! BECAUSE: COLLAPSE clause was specified with argument 2
 !$omp loop collapse(2)
 do i = 1, 1
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index 300485f067b1d..24755022f0296 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -7,11 +7,8 @@ program omp
   logical cond(10,10,10)
   cond = .false.
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
-    !BECAUSE: This code prevents perfect nesting
     !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
     if (i .lt. 1) cycle
     do j = 0, 10
@@ -22,12 +19,9 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
     do j = 0, 10
-      !BECAUSE: This code prevents perfect nesting
       !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
       if (i .lt. 1) cycle
       do k  = 0, 10
@@ -37,11 +31,8 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp do  collapse(2)
   do i = 0, 10
-    !BECAUSE: This code prevents perfect nesting
     !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
     if (i .lt. 1) cycle
     do j = 0, 10
@@ -53,11 +44,8 @@ program omp
   !$omp end do
 
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp do  collapse(2)
   foo: do i = 0, 10
-    !BECAUSE: This code prevents perfect nesting
     !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
     if (i .lt. 1) cycle foo
     do j = 0, 10
@@ -69,12 +57,9 @@ program omp
   !$omp end do
 
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do collapse(3)
   do 60 i=2,200,2
     do j=1,10
-      !BECAUSE: This code prevents perfect nesting
       !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
       if (i == 100) cycle
       do k = 1, 10
diff --git a/flang/test/Semantics/OpenMP/do10.f90 b/flang/test/Semantics/OpenMP/do10.f90
index 1878864a4a5db..1fae5e098dc56 100644
--- a/flang/test/Semantics/OpenMP/do10.f90
+++ b/flang/test/Semantics/OpenMP/do10.f90
@@ -14,7 +14,7 @@ program omp_do
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+  !ERROR: This construct requires a nest of depth 3, but the associated nest is a nest of depth 2
   !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do collapse(3)
   do i = 1, 10
diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90
index 6d5e799e951b0..dd9f4be03e7e1 100644
--- a/flang/test/Semantics/OpenMP/do13.f90
+++ b/flang/test/Semantics/OpenMP/do13.f90
@@ -5,7 +5,7 @@
 program omp
   integer i, j, k
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 3, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
@@ -20,7 +20,7 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+  !ERROR: This construct requires a nest of depth 3, but the associated nest is a nest of depth 2
   !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
@@ -35,7 +35,7 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp do  collapse(2)
   do i = 0, 10
@@ -51,7 +51,7 @@ program omp
   !$omp end do
 
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
   !BECAUSE: COLLAPSE clause was specified with argument 2
   !$omp do  collapse(2)
   foo: do i = 0, 10
@@ -67,7 +67,7 @@ program omp
   !$omp end do
 
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+  !ERROR: This construct requires a nest of depth 3, but the associated nest is a nest of depth 2
   !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do collapse(3)
   do 60 i=1,10
diff --git a/flang/test/Semantics/OpenMP/do15.f90 b/flang/test/Semantics/OpenMP/do15.f90
index 00baa0c431c5f..96c195b2d1306 100644
--- a/flang/test/Semantics/OpenMP/do15.f90
+++ b/flang/test/Semantics/OpenMP/do15.f90
@@ -5,11 +5,8 @@
 program omp
   integer i, j, k
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
-    !BECAUSE: This code prevents perfect nesting
     if (i .lt. 1) then
       !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
       cycle
@@ -22,12 +19,9 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
     do j = 0, 10
-      !BECAUSE: This code prevents perfect nesting
       if (i .lt. 1) then
         !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
         cycle
@@ -56,12 +50,9 @@ program omp
   !$omp end do
 
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   foo: do i = 0, 10
     foo1: do j = 0, 10
-      !BECAUSE: This code prevents perfect nesting
       if (i .lt. 1) then
         !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
         cycle foo
diff --git a/flang/test/Semantics/OpenMP/do16.f90 b/flang/test/Semantics/OpenMP/do16.f90
index 35d94b17f3c68..2a871156de75f 100644
--- a/flang/test/Semantics/OpenMP/do16.f90
+++ b/flang/test/Semantics/OpenMP/do16.f90
@@ -5,11 +5,8 @@
 program omp
   integer i, j, k
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
-    !BECAUSE: This code prevents perfect nesting
     select case (i)
     case(1)
       !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -23,12 +20,9 @@ program omp
   end do
   !$omp end do
 
-  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
-  !BECAUSE: COLLAPSE clause was specified with argument 3
   !$omp do  collapse(3)
   do i = 0, 10
     do j = 0, 10
-      !BECAUSE: This code prevents perfect nesting
       select case (i)
       case(1)
         !ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
diff --git a/flang/test/Semantics/OpenMP/do22.f90 b/flang/test/Semantics/OpenMP/do22.f90
index 2ced881a2af8b..818de56aa1093 100644
--- a/flang/test/Semantics/OpenMP/do22.f90
+++ b/flang/test/Semantics/OpenMP/do22.f90
@@ -4,11 +4,9 @@
 subroutine do_imperfectly_nested_before
   integer i, j
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 2
+  ! Valid: print is allowed as canonical loop nest (CLN) intervening code with collapse
   !$omp do collapse(2)
   do i = 1, 10
-    !BECAUSE: This code prevents perfect nesting
     print *, i
     do j = 1, 10
       print *, i, j
@@ -21,15 +19,134 @@ subroutine do_imperfectly_nested_before
 subroutine do_imperfectly_nested_behind
   integer i, j
 
-  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
-  !BECAUSE: COLLAPSE clause was specified with argument 2
+  ! Valid: print is allowed as CLN intervening code with collapse
+  !$omp do collapse(2)
+  do i = 1, 10
+    do j = 1, 10
+      print *, i, j
+    end do
+    print *, i
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfectly_nested_scalar_assign
+  integer i, j, x
+
+  ! Valid: scalar assignment is allowed as CLN intervening code with collapse
+  !$omp do collapse(2)
+  do i = 1, 10
+    x = i + 1
+    do j = 1, 10
+      print *, i, j, x
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfectly_nested_call
+  integer i, j
+
+  ! Valid: subroutine call is allowed as CLN intervening code with collapse
   !$omp do collapse(2)
   do i = 1, 10
+    call sub(i)
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfectly_nested_multiple
+  integer i, j, x
+
+  ! Valid: multiple scalar statements are allowed as CLN intervening code
+  !$omp do collapse(2)
+  do i = 1, 10
+    x = i * 2
+    print *, x
+    call sub(x)
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfect_collapse_bare_ordered
+  integer i, j, x
+
+  ! Valid: bare ORDERED does not require a perfect nest.
+  !$omp do collapse(2) ordered
+  do i = 1, 10
+    x = 0
+    do j = 1, 10
+      !$omp ordered
+      print *, i, j, x
+      !$omp end ordered
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfect_ordered_requires_perfect
+  integer i, j
+
+  ! ordered(2) still requires perfect nesting at default OpenMP version
+  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: ORDERED clause was specified with argument 2
+  !$omp do ordered(2)
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
     do j = 1, 10
       print *, i, j
     end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfect_collapse_ordered_requires_perfect
+  integer i, j, k
+
+  ! collapse(2) ordered(3) requires perfect nesting at default OpenMP version because ordered(3) > collapse(2)
+  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: ORDERED clause was specified with argument 3
+  !$omp do collapse(2) ordered(3)
+  do i = 1, 10
     !BECAUSE: This code prevents perfect nesting
     print *, i
+    do j = 1, 10
+      do k = 1, 10
+        print *, i, j, k
+      end do
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfect_array_assign_invalid
+  integer i, j
+  integer :: a(10)
+
+  ! Array assignment is invalid CLN intervening code
+  !ERROR: This construct requires a nest of depth 2, but the associated nest is a nest of depth 1
+  !BECAUSE: COLLAPSE clause was specified with argument 2
+  !$omp do collapse(2)
+  do i = 1, 10
+    !BECAUSE: The nest contains code that prevents it from being canonical at this nesting level
+    a(:) = 0
+    do j = 1, 10
+      print *, i, j
+    end do
   end do
   !$omp end do
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90 b/flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90
new file mode 100644
index 0000000000000..492a961fa997f
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90
@@ -0,0 +1,110 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52
+! OpenMP 5.2: doacross loop nests (those with ordered doacross(sink/source)
+! constructs) require perfect nesting.
+
+! ordered(2) without doacross directives: imperfect nesting is valid in 5.2.
+subroutine ordered_no_doacross_imperfect
+  integer i, j
+
+  !$omp do ordered(2)
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Perfectly nested doacross: valid.
+subroutine doacross_perfect
+  integer i, j
+
+  !$omp do ordered(2)
+  do i = 1, 10
+    do j = 1, 10
+      !$omp ordered doacross(sink: i-1, j)
+      print *, i, j
+      !$omp ordered doacross(source)
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Imperfectly nested doacross: invalid in 5.2.
+subroutine doacross_imperfect
+  integer i, j
+
+  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: ORDERED clause was specified with argument 2
+  !$omp do ordered(2)
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
+    do j = 1, 10
+      !$omp ordered doacross(sink: i-1, j)
+      print *, i, j
+      !$omp ordered doacross(source)
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! collapse(2) ordered(3) without doacross: imperfect nesting is valid.
+subroutine collapse_ordered_no_doacross_imperfect
+  integer i, j, k
+
+  !$omp do collapse(2) ordered(3)
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      do k = 1, 10
+        print *, i, j, k
+      end do
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Doacross with collapse: ordered(N) controls depth when N > collapse.
+subroutine doacross_collapse
+  integer i, j, k
+
+  !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: ORDERED clause was specified with argument 3
+  !$omp do collapse(2) ordered(3)
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
+    do j = 1, 10
+      do k = 1, 10
+        !$omp ordered doacross(sink: i-1, j, k)
+        print *, i, j, k
+        !$omp ordered doacross(source)
+      end do
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Doacross inside a nested OpenMP region should not force perfect nesting on
+! the outer loop. The doacross binds to the inner loop, not the outer one.
+subroutine doacross_in_nested_region
+  integer i, j, k
+
+  !$omp do collapse(2)
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      !$omp parallel
+      !$omp do ordered(1)
+      do k = 1, 10
+        !$omp ordered doacross(source)
+        print *, k
+      end do
+      !$omp end do
+      !$omp end parallel
+    end do
+  end do
+  !$omp end do
+end subroutine

>From 2639171cf3c72a4c62d928daa48075144757cd30 Mon Sep 17 00:00:00 2001
From: Caroline Newcombe <caroline.newcombe at hpe.com>
Date: Tue, 16 Jun 2026 10:53:59 -0500
Subject: [PATCH 2/3] Remove brace initializations to match lowering style

---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 70 +++++++++++++++----------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index f9eb59cbe0b95..3c0fce0703aa5 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -702,12 +702,12 @@ static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
     return;
   }
 
-  fir::FirOpBuilder &firOpBuilder{converter.getFirOpBuilder()};
-  const mlir::Location loc{converter.getCurrentLocation()};
+  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  const mlir::Location loc = converter.getCurrentLocation();
 
   // Get the enclosing omp.loop_nest to access induction variables and bounds.
-  auto loopNestOp{mlir::dyn_cast<mlir::omp::LoopNestOp>(
-      firOpBuilder.getInsertionBlock()->getParentOp())};
+  auto loopNestOp = mlir::dyn_cast<mlir::omp::LoopNestOp>(
+      firOpBuilder.getInsertionBlock()->getParentOp());
   assert(loopNestOp && "expected to be inside omp.loop_nest");
 
   // Collect before/after evaluations at each intermediate level.
@@ -717,11 +717,11 @@ static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
   };
   llvm::SmallVector<LevelInfo> levels;
 
-  lower::pft::Evaluation *curEval{&outerEval};
-  for (int i{0}; i < collapseValue - 1; ++i) {
-    lower::pft::Evaluation *const doEval{getNestedDoConstruct(*curEval)};
+  lower::pft::Evaluation *curEval = &outerEval;
+  for (int i = 0; i < collapseValue - 1; ++i) {
+    lower::pft::Evaluation *doEval = getNestedDoConstruct(*curEval);
     LevelInfo level;
-    bool pastDo{false};
+    bool pastDo = false;
     for (lower::pft::Evaluation &e : doEval->getNestedEvaluations()) {
       if (e.getIf<parser::NonLabelDoStmt>() || e.getIf<parser::EndDoStmt>())
         continue;
@@ -748,31 +748,31 @@ static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
   // steps where the IV may never exactly equal the upper bound.
   auto buildGuard = [&](const int startLevel, const int endLevel,
                         const bool useLowerBound) -> mlir::Value {
-    mlir::Value cond{};
-    const auto lbs{loopNestOp.getLoopLowerBounds()};
-    const auto ubs{loopNestOp.getLoopUpperBounds()};
-    const auto steps{loopNestOp.getLoopSteps()};
-    for (int lvl{startLevel}; lvl < endLevel; ++lvl) {
-      const mlir::Value iv{loopNestOp.getRegion().getArgument(lvl)};
+    mlir::Value cond;
+    const auto lbs = loopNestOp.getLoopLowerBounds();
+    const auto ubs = loopNestOp.getLoopUpperBounds();
+    const auto steps = loopNestOp.getLoopSteps();
+    for (int lvl = startLevel; lvl < endLevel; ++lvl) {
+      const mlir::Value iv = loopNestOp.getRegion().getArgument(lvl);
       mlir::Value target;
       if (useLowerBound) {
         target = lbs[lvl];
       } else {
         // For unit steps, the last induction variable always equals ub.
-        const auto constStep{fir::getIntIfConstant(steps[lvl])};
+        const auto constStep = fir::getIntIfConstant(steps[lvl]);
         if (constStep && (*constStep == 1 || *constStep == -1)) {
           target = ubs[lvl];
         } else {
           // Compute last_iv = lb + ((ub - lb) / step) * step.
-          const mlir::Value lb{lbs[lvl]};
-          const mlir::Value ub{ubs[lvl]};
-          const mlir::Value step{steps[lvl]};
-          const mlir::Value range{
-              mlir::arith::SubIOp::create(firOpBuilder, loc, ub, lb)};
-          const mlir::Value tripMinus1{
-              mlir::arith::DivSIOp::create(firOpBuilder, loc, range, step)};
-          const mlir::Value lastOffset{
-              mlir::arith::MulIOp::create(firOpBuilder, loc, tripMinus1, step)};
+          const mlir::Value lb = lbs[lvl];
+          const mlir::Value ub = ubs[lvl];
+          const mlir::Value step = steps[lvl];
+          const mlir::Value range =
+              mlir::arith::SubIOp::create(firOpBuilder, loc, ub, lb);
+          const mlir::Value tripMinus1 =
+              mlir::arith::DivSIOp::create(firOpBuilder, loc, range, step);
+          const mlir::Value lastOffset =
+              mlir::arith::MulIOp::create(firOpBuilder, loc, tripMinus1, step);
           target =
               mlir::arith::AddIOp::create(firOpBuilder, loc, lb, lastOffset);
         }
@@ -788,12 +788,12 @@ static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
   };
 
   // Emit "before" code at each level, guarded by inner IVs == lower bounds.
-  for (int i{0}; i < static_cast<int>(levels.size()); ++i) {
+  for (int i = 0; i < static_cast<int>(levels.size()); ++i) {
     if (levels[i].before.empty())
       continue;
-    const mlir::Value guard{
-        buildGuard(i + 1, collapseValue, /*useLowerBound=*/true)};
-    auto ifOp{fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false)};
+    const mlir::Value guard =
+        buildGuard(i + 1, collapseValue, /*useLowerBound=*/true);
+    auto ifOp = fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false);
     firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
     for (auto *e : levels[i].before)
       converter.genEval(*e);
@@ -805,12 +805,12 @@ static void genCollapsedLoopNestBody(lower::AbstractConverter &converter,
 
   // Emit "after" code at each level (innermost first), guarded by
   // inner IVs == last iteration values (accounts for non-unit steps).
-  for (int i{static_cast<int>(levels.size()) - 1}; i >= 0; --i) {
+  for (int i = static_cast<int>(levels.size()) - 1; i >= 0; --i) {
     if (levels[i].after.empty())
       continue;
-    const mlir::Value guard{
-        buildGuard(i + 1, collapseValue, /*useLowerBound=*/false)};
-    auto ifOp{fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false)};
+    const mlir::Value guard =
+        buildGuard(i + 1, collapseValue, /*useLowerBound=*/false);
+    auto ifOp = fir::IfOp::create(firOpBuilder, loc, guard, /*else*/ false);
     firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
     for (auto *e : levels[i].after)
       converter.genEval(*e);
@@ -1410,8 +1410,8 @@ struct OpWithBodyGenInfo {
   bool privatize = true;
   /// [in] if set, outermost evaluation and collapse depth for emitting
   /// intervening code from imperfect collapsed loop nests.
-  lower::pft::Evaluation *outerCollapseEval{nullptr};
-  int collapseValue{0};
+  lower::pft::Evaluation *outerCollapseEval = nullptr;
+  int collapseValue = 0;
 };
 
 /// Create the body (block) for an OpenMP Operation.
@@ -2438,7 +2438,7 @@ static void genCanonicalLoopNest(
   // Step 1: Loop prologues
   // Computing the trip count must happen before entering the outermost loop
   lower::pft::Evaluation *innermostEval = nestedEval;
-  for (std::size_t i{0}; i < ivs.size(); ++i) {
+  for (std::size_t i = 0; i < ivs.size(); ++i) {
     if (innermostEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
       // OpenMP specifies DO CONCURRENT only with the `!omp loop` construct.
       // Will need to add special cases for this combination.

>From ccb9dbd6bfff5dcd6b688683c51a65f39101759e Mon Sep 17 00:00:00 2001
From: Caroline Newcombe <caroline.newcombe at hpe.com>
Date: Thu, 18 Jun 2026 15:08:14 -0500
Subject: [PATCH 3/3] [flang][OpenMP] Refine perfect-nesting requirement for
 doacross loop nests

Fix the perfect-nesting requirement for doacross loop nests by version: in
5.0 any ORDERED clause requires a perfect nest, while in 5.1 and 5.2 only an
ORDERED clause with an argument requires a perfect nest. In 6.0 the
requirement is keyed off the body containing an ORDERED directive with a
doacross dependence rather than off the ORDERED clause.

Add a public IsDoacrossAffected(const OpenMPLoopConstruct &) utility in
openmp-utils (with the DoacrossFinder body scan), used by check-omp-loop only
for versions after 5.2. Expand the test coverage (rename omp52 -> omp60 and
add ordered-nesting tests), including ORDERED with no argument defaulting to
the COLLAPSE value.
---
 flang/include/flang/Semantics/openmp-utils.h  |  6 ++
 flang/lib/Semantics/check-omp-loop.cpp        | 52 ++---------
 flang/lib/Semantics/openmp-utils.cpp          | 93 +++++++++++++++++--
 ...g-omp52.f90 => doacross-nesting-omp60.f90} | 46 ++++++++-
 .../OpenMP/ordered-nesting-omp50.f90          | 50 ++++++++++
 .../OpenMP/ordered-nesting-omp51.f90          | 36 +++++++
 6 files changed, 223 insertions(+), 60 deletions(-)
 rename flang/test/Semantics/OpenMP/{doacross-nesting-omp52.f90 => doacross-nesting-omp60.f90} (68%)
 create mode 100644 flang/test/Semantics/OpenMP/ordered-nesting-omp50.f90
 create mode 100644 flang/test/Semantics/OpenMP/ordered-nesting-omp51.f90

diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index c2e89fe829ce0..4a519dbde6f33 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -270,6 +270,12 @@ std::optional<std::vector<const parser::DoConstruct *>> CollectAffectedDoLoops(
     const parser::OpenMPLoopConstruct &x, unsigned version,
     SemanticsContext *semaCtx = nullptr);
 
+/// Returns whether the loop nest of `x` is a doacross loop nest: `x` has an
+/// `ordered` clause and its body contains an `ordered` directive carrying a
+/// doacross dependence (`doacross` clause or pre-5.2 `depend(sink/source)`)
+/// that binds to `x`. Such a nest must be perfectly nested.
+bool IsDoacrossAffected(const parser::OpenMPLoopConstruct &x);
+
 struct LoopSequence {
   LoopSequence(const parser::ExecutionPartConstruct &root, unsigned version,
       bool allowAllLoops = false, SemanticsContext *semaCtx = nullptr);
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index a10d6e4fc93e1..87f6f28f2dda2 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -111,49 +111,6 @@ class AssociatedLoopChecker {
   std::int64_t level_;
   std::map<std::string, std::int64_t> constructNamesAndLevels_;
 };
-
-/// Visitor that detects an ordered directive with a doacross clause
-/// (or the pre-5.2 depend(sink/source) equivalent).
-/// Does not descend into nested OpenMP block or loop constructs, since
-/// doacross directives inside them bind to an inner worksharing-loop
-/// region, not the one being checked.
-struct DoacrossFinder {
-  bool found{false};
-  template <typename T> bool Pre(const T &) { return !found; }
-  template <typename T> void Post(const T &) {}
-
-  // Stop descent into nested OpenMP regions that create new binding contexts.
-  bool Pre(const parser::OmpBlockConstruct &) { return false; }
-  bool Pre(const parser::OpenMPLoopConstruct &) { return false; }
-
-  void Post(const parser::OpenMPSimpleStandaloneConstruct &x) {
-    if (found) {
-      return;
-    }
-    if (x.v.DirId() != llvm::omp::Directive::OMPD_ordered) {
-      return;
-    }
-    for (const auto &clause : x.v.Clauses().v) {
-      if (std::holds_alternative<parser::OmpClause::Doacross>(clause.u)) {
-        found = true;
-        return;
-      }
-      if (const auto *depend{
-              std::get_if<parser::OmpClause::Depend>(&clause.u)}) {
-        if (std::holds_alternative<parser::OmpDoacross>(depend->v.u)) {
-          found = true;
-          return;
-        }
-      }
-    }
-  }
-};
-
-bool ContainsDoacrossDirective(const parser::Block &block) {
-  DoacrossFinder finder;
-  parser::Walk(block, finder);
-  return finder.found;
-}
 } // namespace
 
 namespace Fortran::semantics {
@@ -370,9 +327,12 @@ void OmpStructureChecker::CheckNestedConstruct(
     auto [needDepth, needPerfect]{
         GetAffectedNestDepthWithReason(beginSpec, version)};
 
-    // In OpenMP 5.2+, perfect nesting is only required for doacross loop
-    // nests (those whose body contains ordered doacross directives).
-    if (!needPerfect && version >= 52 && ContainsDoacrossDirective(body)) {
+    // Perfect nesting for doacross loop nests is handled differently across
+    // versions. Only in 6.0+ is the requirement keyed off the body
+    // actually containing an ORDERED directive with a doacross dependence
+    // rather than the ORDERED clause, so the body scan applies only to those
+    // later versions.
+    if (!needPerfect && version > 52 && IsDoacrossAffected(x)) {
       needPerfect = true;
     }
 
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index edff3cb3e9cfd..9a3012ada6bad 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -989,18 +989,32 @@ std::pair<WithReason<int64_t>, bool> GetAffectedNestDepthWithReason(
       ocount = std::nullopt;
       oreason = Reason();
     }
+    bool hasOrdered{
+        parser::omp::FindClause(spec, llvm::omp::Clause::OMPC_ordered) !=
+        nullptr};
+    // Perfect-nesting requirement for the ORDERED clause, by version:
+    //
+    //   5.0:     Any ORDERED clause makes the associated loops a doacross loop
+    //            nest that must be perfectly nested, whether or not the clause
+    //            has an argument.
+    //   5.1/5.2: Only an ORDERED clause *with* an argument requires perfect
+    //            nesting; a bare ORDERED clause does not.
+    //   6.0:     Perfect nesting is required only when the body actually
+    //            contains an ORDERED directive with a doacross dependence;
+    //            that is detected separately by the caller via
+    //            IsDoacrossAffected, so ORDERED(n) alone does not force
+    //            perfect nesting here.
     if (ccount < ocount) {
-      // Prior to 5.2, ordered(n) requires perfect nesting unconditionally.
-      // In 5.2+, perfect nesting is required only for doacross (checked later
-      // with ContainsDoacrossDirective).
-      return {{ocount.value_or(1), std::move(oreason)}, version < 52};
-    }
-    // Prior to 5.2, ordered(n) requires perfect nesting.
-    // In 5.2+, only doacross nests require it (checked separately).
-    // CLN relaxation for collapse is applied retroactively for all versions.
+      return {{ocount.value_or(1), std::move(oreason)}, version <= 52};
+    }
+    // Same rule as above when COLLAPSE drives the depth: ORDERED(n) requires a
+    // perfect nest through 5.2, while > 5.2 defers to IsDoacrossAffected. In
+    // 5.0, an ORDERED clause without argument also requires perfect nesting.
+    // The CLN relaxation for COLLAPSE is applied retroactively for all
+    // versions.
     bool needPerfect{false};
-    if (version < 52) {
-      needPerfect = ocount.has_value();
+    if (version <= 52) {
+      needPerfect = ocount.has_value() || (version == 50 && hasOrdered);
     }
     return {{ccount.value_or(1), std::move(creason)}, needPerfect};
   }
@@ -1220,6 +1234,65 @@ std::optional<int64_t> GetMinimumSequenceCount(
   return GetMinimumSequenceCount(std::nullopt, std::nullopt);
 }
 
+namespace {
+/// Visitor that detects an `ordered` directive carrying a doacross dependence
+/// (the `doacross` clause, or the pre-5.2 `depend(sink/source)` equivalent)
+/// that binds to the loop construct being checked. Prunes nested constructs
+/// that start their own associated loop nest, but descends into
+/// loop-transforming constructs (e.g. tile, unroll), whose generated loops
+/// extend the current nest.
+struct DoacrossFinder {
+  bool found{false};
+  bool inOrdered{false};
+  template <typename T> bool Pre(const T &) { return !found; }
+  template <typename T> void Post(const T &) {}
+
+  // Prune nested constructs that start their own associated loop nest; a
+  // doacross inside them binds there, not here. Loop-transforming constructs
+  // are the exception: their generated loops extend the current nest, so a
+  // doacross inside one still binds to the construct being checked.
+  bool Pre(const parser::OmpBlockConstruct &) { return false; }
+  bool Pre(const parser::OpenMPLoopConstruct &x) {
+    if (IsLoopTransforming(x.BeginDir().DirId())) {
+      return !found;
+    }
+    return false;
+  }
+
+  bool Pre(const parser::OpenMPSimpleStandaloneConstruct &x) {
+    inOrdered = x.v.DirId() == llvm::omp::Directive::OMPD_ordered;
+    return !found;
+  }
+  void Post(const parser::OpenMPSimpleStandaloneConstruct &) {
+    inOrdered = false;
+  }
+
+  bool Pre(const parser::OmpDoacross &) {
+    if (inOrdered) {
+      found = true;
+    }
+    return false;
+  }
+};
+
+static bool ContainsOrderedDoacross(const parser::Block &block) {
+  DoacrossFinder finder;
+  parser::Walk(block, finder);
+  return finder.found;
+}
+} // namespace
+
+bool IsDoacrossAffected(const parser::OpenMPLoopConstruct &x) {
+  // A loop nest is doacross-affected when it has an `ordered` clause and a
+  // stand-alone `ordered` construct carrying a doacross dependence is closely
+  // nested in its body.
+  const parser::OmpDirectiveSpecification &spec{x.BeginDir()};
+  if (!parser::omp::FindClause(spec, llvm::omp::Clause::OMPC_ordered)) {
+    return false;
+  }
+  return ContainsOrderedDoacross(std::get<parser::Block>(x.t));
+}
+
 /// Collect the DO loops that are affected directly by the given loop
 /// transformation. Not all DO loops nested in the associated nest are
 /// affected by the top-level loop transformation, e.g.
diff --git a/flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90 b/flang/test/Semantics/OpenMP/doacross-nesting-omp60.f90
similarity index 68%
rename from flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90
rename to flang/test/Semantics/OpenMP/doacross-nesting-omp60.f90
index 492a961fa997f..d11be5a60e072 100644
--- a/flang/test/Semantics/OpenMP/doacross-nesting-omp52.f90
+++ b/flang/test/Semantics/OpenMP/doacross-nesting-omp60.f90
@@ -1,8 +1,8 @@
-! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52
-! OpenMP 5.2: doacross loop nests (those with ordered doacross(sink/source)
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+! OpenMP 6.0: doacross loop nests (those with ordered doacross(sink/source)
 ! constructs) require perfect nesting.
 
-! ordered(2) without doacross directives: imperfect nesting is valid in 5.2.
+! ordered(2) without doacross directives: imperfect nesting is valid in 6.0.
 subroutine ordered_no_doacross_imperfect
   integer i, j
 
@@ -16,6 +16,23 @@ subroutine ordered_no_doacross_imperfect
   !$omp end do
 end subroutine
 
+! Bare ORDERED (no argument) carries no doacross intent and does not require
+! perfect nesting.
+subroutine bare_ordered_no_doacross
+  integer i, j, x
+
+  !$omp do ordered
+  do i = 1, 10
+    x = 0
+    do j = 1, 10
+      !$omp ordered
+      print *, i, j, x
+      !$omp end ordered
+    end do
+  end do
+  !$omp end do
+end subroutine
+
 ! Perfectly nested doacross: valid.
 subroutine doacross_perfect
   integer i, j
@@ -31,7 +48,7 @@ subroutine doacross_perfect
   !$omp end do
 end subroutine
 
-! Imperfectly nested doacross: invalid in 5.2.
+! Imperfectly nested doacross: invalid in 6.0.
 subroutine doacross_imperfect
   integer i, j
 
@@ -87,6 +104,27 @@ subroutine doacross_collapse
   !$omp end do
 end subroutine
 
+! ORDERED with no argument: the number of doacross-affected loops defaults to
+! the COLLAPSE value (2 here). An imperfectly nested doacross is therefore
+! invalid, and the diagnostic is keyed off COLLAPSE rather than ORDERED.
+subroutine doacross_collapse_ordered_default
+  integer i, j
+
+  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: COLLAPSE clause was specified with argument 2
+  !$omp do collapse(2) ordered
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
+    do j = 1, 10
+      !$omp ordered doacross(sink: i-1, j)
+      print *, i, j
+      !$omp ordered doacross(source)
+    end do
+  end do
+  !$omp end do
+end subroutine
+
 ! Doacross inside a nested OpenMP region should not force perfect nesting on
 ! the outer loop. The doacross binds to the inner loop, not the outer one.
 subroutine doacross_in_nested_region
diff --git a/flang/test/Semantics/OpenMP/ordered-nesting-omp50.f90 b/flang/test/Semantics/OpenMP/ordered-nesting-omp50.f90
new file mode 100644
index 0000000000000..f2b82c343cf9c
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/ordered-nesting-omp50.f90
@@ -0,0 +1,50 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=50
+! OpenMP 5.0: an ORDERED clause requires perfect nesting, whether or not the
+! clause has an argument. This is stricter than 5.1/5.2, where only ORDERED(n)
+! (with an argument) requires perfect nesting (see ordered-nesting-omp51.f90).
+
+! Bare ORDERED with COLLAPSE(2): the two associated loops must be perfectly
+! nested in 5.0, so intervening code between them is an error.
+subroutine bare_ordered_collapse_imperfect
+  integer i, j
+
+  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: COLLAPSE clause was specified with argument 2
+  !$omp do collapse(2) ordered
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Bare ORDERED with COLLAPSE(2), perfectly nested: valid.
+subroutine bare_ordered_collapse_perfect
+  integer i, j
+
+  !$omp do collapse(2) ordered
+  do i = 1, 10
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! Bare ORDERED with no COLLAPSE: only one loop is associated (depth 1), so
+! perfect nesting is trivially satisfied and intervening code is allowed.
+subroutine bare_ordered_no_collapse
+  integer i, j
+
+  !$omp do ordered
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/ordered-nesting-omp51.f90 b/flang/test/Semantics/OpenMP/ordered-nesting-omp51.f90
new file mode 100644
index 0000000000000..a5f282dbdcd24
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/ordered-nesting-omp51.f90
@@ -0,0 +1,36 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51
+! OpenMP 5.1/5.2: only an ORDERED clause *with* an argument requires perfect
+! nesting. A bare ORDERED clause does not, unlike in 5.0 (see
+! ordered-nesting-omp50.f90).
+
+! Bare ORDERED with COLLAPSE(2), imperfectly nested: valid in 5.1, since the
+! bare ORDERED clause does not impose perfect nesting.
+subroutine bare_ordered_collapse_imperfect
+  integer i, j
+
+  !$omp do collapse(2) ordered
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+! ORDERED(2) with an argument still requires perfect nesting.
+subroutine ordered_arg_imperfect
+  integer i, j
+
+  !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+  !BECAUSE: ORDERED clause was specified with argument 2
+  !$omp do ordered(2)
+  do i = 1, 10
+    !BECAUSE: This code prevents perfect nesting
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine



More information about the flang-commits mailing list