[flang-commits] [flang] [flang][OpenACC] support collapse on unstructured acc.loop (PR #196174)

Eugene Epshteyn via flang-commits flang-commits at lists.llvm.org
Wed May 6 20:19:31 PDT 2026


https://github.com/eugeneepshteyn updated https://github.com/llvm/llvm-project/pull/196174

>From 21a24de472af9b7559693f063eaba9a09acd33fc Mon Sep 17 00:00:00 2001
From: Eugene Epshteyn <eepshteyn at nvidia.com>
Date: Wed, 6 May 2026 16:19:52 -0400
Subject: [PATCH 1/3] [flang][OpenACC] support collapse on unstructured
 acc.loop

PR #164992 added unstructured-loop support to OpenACC lowering (no bounds
on acc.loop, IVs privatized, body emitted as explicit cf), but it didn't
covered the `collapse(N)` case. Compiling

  !$acc parallel loop collapse(2)
  do j = 1, n
    do i = 1, n
      if (i == jdiag) then
        a(i,j) = 0.0d0
        cycle
      end if
      a(i,j) = real(i + j, 8)
    end do
  end do

asserted in MLIR's runRegionDCE: "Assertion `mightHaveTerminator()' failed".

Root cause: visitLoopControl unconditionally marked every inner DO of a
collapsed nest via markDoConstructAsCollapsed. genFIR(DoConstruct) then
read that marking and skipped the inner DO's loop machinery on the
assumption that the parent acc.loop iterates and supplies the IV via a
block argument. That assumption holds for the structured case, but not for
the unstructured case added in #164992. Skipping it left the
PFT-pre-allocated scaffold blocks (pre-header, header, exit) without
terminators.

Fix: add a markInnerCollapsed parameter (default true) to
visitLoopControl, and pass false from privatizeInductionVariables (the
unstructured case of buildACCLoopOp).

Assisted-by: AI
---
 flang/lib/Lower/OpenACC.cpp                   | 24 ++++-
 flang/test/Lower/OpenACC/acc-unstructured.f90 | 88 +++++++++++++++++++
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 161023aa51db8..f6025d36ef599 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1504,13 +1504,22 @@ static void determineDefaultLoopParMode(
 }
 
 // Helper to visit Bounds of DO LOOP nest.
+//
+// When `markInnerCollapsed` is true (the default), inner DOs that are absorbed
+// into the parent acc.loop's collapse semantics are tagged via
+// markDoConstructAsCollapsed so that genFIR(DoConstruct) skips re-emitting
+// their loop machinery (the parent acc.loop iterates and supplies the IV via
+// a block argument). Pass `false` when the parent acc.loop is unstructured:
+// it has no IV block arguments and the inner DO's iteration mechanics must
+// be emitted as ordinary control flow inside the acc.loop body.
 static void visitLoopControl(
     Fortran::lower::AbstractConverter &converter,
     const Fortran::parser::DoConstruct &outerDoConstruct,
     uint64_t loopsToProcess, Fortran::lower::pft::Evaluation &eval,
     std::function<void(const Fortran::parser::LoopControl::Bounds &,
                        mlir::Location)>
-        callback) {
+        callback,
+    bool markInnerCollapsed = true) {
   Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
   for (uint64_t i = 0; i < loopsToProcess; ++i) {
     const Fortran::parser::LoopControl *loopControl;
@@ -1537,7 +1546,8 @@ static void visitLoopControl(
       if (!innerDo)
         break; // No deeper loop; stop collecting collapsed bounds.
 
-      Fortran::lower::markDoConstructAsCollapsed(*innerDo);
+      if (markInnerCollapsed)
+        Fortran::lower::markDoConstructAsCollapsed(*innerDo);
       mlir::Location loc =
           converter.genLocation(Fortran::parser::FindSourceLocation(*innerDo));
       if (innerDo->IsDoConcurrent())
@@ -1920,6 +1930,13 @@ static void privatizeInductionVariables(
   llvm::SmallVector<mlir::Location> ivLocs;
   assert(!outerDoConstruct.IsDoConcurrent() &&
          "do concurrent loops are not expected to contained early exits");
+  // Do not mark inner DOs as "collapsed": the parent acc.loop is unstructured
+  // and has no IV block arguments, so each inner DO must emit its own
+  // iteration mechanics as cf inside the acc.loop body. Marking them collapsed
+  // would cause genFIR(DoConstruct) to lower only the body and leave the
+  // pre-allocated PFT scaffold blocks (preheader, header, exit) without
+  // terminators -- which then trips runRegionDCE's mightHaveTerminator
+  // assertion.
   visitLoopControl(converter, outerDoConstruct, loopsToProcess, eval,
                    [&](const Fortran::parser::LoopControl::Bounds &bounds,
                        mlir::Location loc) {
@@ -1928,7 +1945,8 @@ static void privatizeInductionVariables(
                          bounds.Name().thing.symbol->GetUltimate();
                      privatizeIv(converter, ivSym, currentLocation, ivTypes,
                                  ivLocs, privateOperands, ivPrivate);
-                   });
+                   },
+                   /*markInnerCollapsed=*/false);
 }
 
 static mlir::acc::LoopOp
diff --git a/flang/test/Lower/OpenACC/acc-unstructured.f90 b/flang/test/Lower/OpenACC/acc-unstructured.f90
index 57b678c1200d9..8c5fbf568149e 100644
--- a/flang/test/Lower/OpenACC/acc-unstructured.f90
+++ b/flang/test/Lower/OpenACC/acc-unstructured.f90
@@ -222,3 +222,91 @@ subroutine test_unstructured8(a, n)
 ! CHECK: fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK: arith.cmpi eq
 ! CHECK: cf.cond_br
+
+! Test that `acc parallel loop collapse(N)` whose body has an early-exit
+! (here, `if (cond) then ... cycle ... end if`) lowers cleanly. The
+! corresponding acc.loop must privatize all N induction variables, carry
+! both `collapse = [N]` and `unstructured` attributes, and emit the
+! iteration mechanics for all N levels as explicit cf inside the body.
+! Reproducer derived from lorado issue #2856.
+subroutine test_unstructured_collapse_cycle(a)
+  integer :: i, j, jdiag
+  real(8) :: a(:,:)
+  jdiag = 4
+  !$acc parallel loop collapse(2) copy(a)
+  do j = 1, 8
+    do i = 1, 8
+      if (i == jdiag) then
+        a(i, j) = 0.0d0
+        cycle
+      end if
+      a(i, j) = real(i + j, 8)
+    end do
+  end do
+  !$acc end parallel loop
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_cycle
+! CHECK: acc.parallel combined(loop)
+! Both induction variables (j and i) are privatized:
+! CHECK: %[[PRIVJ:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "j"}
+! CHECK: %[[PRIVI:.*]] = acc.private varPtr(%{{.*}} : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "i"}
+! No control(...) on acc.loop — bounds are not on the op:
+! CHECK: acc.loop combined(parallel) private(%[[PRIVJ]], %[[PRIVI]] : !fir.ref<i32>, !fir.ref<i32>) {
+! Outer loop trip-count test (j) emitted as cf:
+! CHECK: arith.cmpi sgt
+! CHECK: cf.cond_br
+! Inner loop trip-count test (i) emitted as cf:
+! CHECK: arith.cmpi sgt
+! CHECK: cf.cond_br
+! The if/cycle is a structured cf branch in the body:
+! CHECK: arith.cmpi eq
+! CHECK: cf.cond_br
+! CHECK: acc.yield
+! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>], unstructured}
+
+! Test that `acc parallel loop collapse(N)` lowers cleanly when the early-exit
+! is a STOP (the form already covered for collapse=1 by test_unstructured2).
+subroutine test_unstructured_collapse_stop(a)
+  integer :: i, j, k
+  real :: a(:,:,:)
+  !$acc parallel loop collapse(3)
+  do i = 1, 10
+    do j = 1, 10
+      do k = 1, 10
+        if (a(1,2,3) > 10) stop 'just to be unstructured'
+      end do
+    end do
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_stop
+! All three IVs privatized:
+! CHECK: acc.private varPtr(%{{.*}} : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "i"}
+! CHECK: acc.private varPtr(%{{.*}} : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "j"}
+! CHECK: acc.private varPtr(%{{.*}} : !fir.ref<i32>) recipe(@privatization_ref_i32) -> !fir.ref<i32> {implicit = true, name = "k"}
+! CHECK: acc.loop combined(parallel) private(%{{.*}}, %{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) {
+! CHECK: fir.call @_FortranAStopStatementText
+! CHECK: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>], unstructured}
+
+! Test the standalone `acc loop collapse(N)` arm (no combined parallel/kernels).
+subroutine test_unstructured_collapse_loop_only(a)
+  integer :: i, j, jdiag
+  real(8) :: a(:,:)
+  jdiag = 4
+  !$acc loop collapse(2)
+  do j = 1, 8
+    do i = 1, 8
+      if (i == jdiag) then
+        a(i, j) = 0.0d0
+        cycle
+      end if
+      a(i, j) = real(i + j, 8)
+    end do
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_unstructured_collapse_loop_only
+! Standalone acc.loop (no `combined(...)`):
+! CHECK: acc.loop private(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) {
+! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>], unstructured}

>From f34d77aa80e2f47bf71a42f18d0594c09b50bdac Mon Sep 17 00:00:00 2001
From: Eugene Epshteyn <eepshteyn at nvidia.com>
Date: Wed, 6 May 2026 16:26:24 -0400
Subject: [PATCH 2/3] clang-format

---
 flang/lib/Lower/OpenACC.cpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index f6025d36ef599..2e0ae0bdf7a27 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1937,16 +1937,17 @@ static void privatizeInductionVariables(
   // pre-allocated PFT scaffold blocks (preheader, header, exit) without
   // terminators -- which then trips runRegionDCE's mightHaveTerminator
   // assertion.
-  visitLoopControl(converter, outerDoConstruct, loopsToProcess, eval,
-                   [&](const Fortran::parser::LoopControl::Bounds &bounds,
-                       mlir::Location loc) {
-                     locs.push_back(loc);
-                     Fortran::semantics::Symbol &ivSym =
-                         bounds.Name().thing.symbol->GetUltimate();
-                     privatizeIv(converter, ivSym, currentLocation, ivTypes,
-                                 ivLocs, privateOperands, ivPrivate);
-                   },
-                   /*markInnerCollapsed=*/false);
+  visitLoopControl(
+      converter, outerDoConstruct, loopsToProcess, eval,
+      [&](const Fortran::parser::LoopControl::Bounds &bounds,
+          mlir::Location loc) {
+        locs.push_back(loc);
+        Fortran::semantics::Symbol &ivSym =
+            bounds.Name().thing.symbol->GetUltimate();
+        privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+                    privateOperands, ivPrivate);
+      },
+      /*markInnerCollapsed=*/false);
 }
 
 static mlir::acc::LoopOp

>From 0cef048c2ce3e99daacc45b183c6e4aece7db897 Mon Sep 17 00:00:00 2001
From: Eugene Epshteyn <eepshteyn at nvidia.com>
Date: Wed, 6 May 2026 23:19:21 -0400
Subject: [PATCH 3/3] Update test description for acc loop collapse

---
 flang/test/Lower/OpenACC/acc-unstructured.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Lower/OpenACC/acc-unstructured.f90 b/flang/test/Lower/OpenACC/acc-unstructured.f90
index 8c5fbf568149e..ce58ae90bdc35 100644
--- a/flang/test/Lower/OpenACC/acc-unstructured.f90
+++ b/flang/test/Lower/OpenACC/acc-unstructured.f90
@@ -289,7 +289,7 @@ subroutine test_unstructured_collapse_stop(a)
 ! CHECK: fir.call @_FortranAStopStatementText
 ! CHECK: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>], unstructured}
 
-! Test the standalone `acc loop collapse(N)` arm (no combined parallel/kernels).
+! Test orphaned `acc loop collapse(N)`
 subroutine test_unstructured_collapse_loop_only(a)
   integer :: i, j, jdiag
   real(8) :: a(:,:)



More information about the flang-commits mailing list