[flang-commits] [flang] [flang][acc] Fix crash on collapse(force:N) with non-tightly nested loops (PR #191310)

via flang-commits flang-commits at lists.llvm.org
Fri Apr 17 10:11:19 PDT 2026


https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/191310

>From d5f542d5c4360c2c4bdaa67193a7f9af68e1908c Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 14 Apr 2026 17:53:38 -0700
Subject: [PATCH 1/4] [Flang][OpenACC] Fix collapse(force:N) with non-tightly
 nested loops

When collapse(force:N) is applied to non-tightly nested loops, the
compiler could crash or generate redundant inner loops.

Crashes occurred because getNestedEvaluations() was called without
checking hasNestedEvaluations() first. Add guards in hasEarlyReturn(),
createRegionOp(), and the collapse-force sinking logic in Bridge.cpp.

Redundant inner loops were generated because processDoLoopBounds
absorbed N levels of do-loops into the outer acc.loop, but the PFT
walker still generated separate acc.loop ops for those same loops.
Fix by tracking absorbed DoConstruct* pointers in visitLoopControl
and skipping them in genFIR(DoConstruct).

Made-with: Cursor
---
 flang/include/flang/Lower/OpenACC.h           | 10 ++++++
 flang/lib/Lower/Bridge.cpp                    | 36 +++++++++++++------
 flang/lib/Lower/OpenACC.cpp                   | 23 +++++++++++-
 .../acc-loop-collapse-force-lowering.f90      | 15 +++-----
 ...loop-collapse-force-non-tightly-nested.f90 | 28 +++++++++++++++
 5 files changed, 90 insertions(+), 22 deletions(-)
 create mode 100644 flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90

diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index 745c8686457f4..02b4331a1f64e 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -110,6 +110,16 @@ getCollapseSizeAndForce(const Fortran::parser::AccClauseList &);
 /// Checks whether the current insertion point is inside OpenACC loop.
 bool isInOpenACCLoop(fir::FirOpBuilder &);
 
+/// Record a DoConstruct as having been absorbed by a collapse clause.
+/// The PFT walker should skip generating a loop for it.
+void markDoConstructAsCollapsed(const Fortran::parser::DoConstruct &);
+
+/// Check whether a DoConstruct was absorbed by a collapse clause.
+bool isCollapsedDoConstruct(const Fortran::parser::DoConstruct &);
+
+/// Clear the collapsed DoConstruct tracking set.
+void clearCollapsedDoConstructs();
+
 /// Checks whether the current insertion point is inside OpenACC compute
 /// construct.
 bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 19ab789018f40..f55db18265ba5 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2501,6 +2501,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     Fortran::lower::pft::Evaluation &eval = getEval();
     bool unstructuredContext = eval.lowerAsUnstructured();
 
+    // If this do-loop was absorbed by a collapse clause on a parent acc.loop,
+    // skip generating any loop — just lower the body.  The IV value is
+    // already available from the parent acc.loop's block argument.
+    if (Fortran::lower::isCollapsedDoConstruct(doConstruct)) {
+      auto iter = eval.getNestedEvaluations().begin();
+      for (auto end = --eval.getNestedEvaluations().end(); iter != end; ++iter)
+        genFIR(*iter, unstructuredContext);
+      return;
+    }
+
     // Loops with induction variables inside OpenACC compute constructs
     // need special handling to ensure that the IVs are privatized.
     if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
@@ -3591,6 +3601,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   }
 
   void genFIR(const Fortran::parser::OpenACCConstruct &acc) {
+    Fortran::lower::clearCollapsedDoConstructs();
     mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint();
 
     // Cache constructs should not push/pop a scope because they need to update
@@ -3649,8 +3660,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
       if (curEval->lowerAsStructured()) {
         curEval = &curEval->getFirstNestedEvaluation();
-        for (uint64_t i = 1; i < loopCount; i++)
+        for (uint64_t i = 1; i < loopCount; i++) {
+          if (!curEval->hasNestedEvaluations())
+            break;
           curEval = &*std::next(curEval->getNestedEvaluations().begin());
+        }
       }
     }
 
@@ -3677,9 +3691,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           }
           prologue.push_back(&*it);
         }
-        // Semantics guarantees collapseDepth does not exceed nest depth
-        // so childLoop must be found here.
-        assert(childLoop && "Expected inner DoConstruct for collapse");
+        if (!childLoop)
+          break;
         parent = childLoop;
         innermostLoopEval = childLoop;
       }
@@ -3700,16 +3713,19 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       sink(prologue);
 
       // Lower innermost loop body, skipping sunk
-      for (Fortran::lower::pft::Evaluation &e :
-           innermostLoopEval->getNestedEvaluations())
-        if (!sunk.contains(&e))
-          genFIR(e);
+      if (innermostLoopEval && innermostLoopEval->hasNestedEvaluations())
+        for (Fortran::lower::pft::Evaluation &e :
+             innermostLoopEval->getNestedEvaluations())
+          if (!sunk.contains(&e))
+            genFIR(e);
 
       sink(epilogue);
     } else {
       // Normal lowering
-      for (Fortran::lower::pft::Evaluation &e : curEval->getNestedEvaluations())
-        genFIR(e);
+      if (curEval->hasNestedEvaluations())
+        for (Fortran::lower::pft::Evaluation &e :
+             curEval->getNestedEvaluations())
+          genFIR(e);
     }
     if (!isCacheConstruct)
       localSymbols.popScope();
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 5a7fe899b372f..1eb9dd7cf22f2 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1215,7 +1215,8 @@ createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc,
 
   // If it is an unstructured region and is not the outer region of a combined
   // construct, create empty blocks for all evaluations.
-  if (eval.lowerAsUnstructured() && !outerCombined)
+  if (eval.lowerAsUnstructured() && !outerCombined &&
+      eval.hasNestedEvaluations())
     Fortran::lower::createEmptyRegionBlocks<mlir::acc::TerminatorOp,
                                             mlir::acc::YieldOp>(
         builder, eval.getNestedEvaluations());
@@ -1544,6 +1545,7 @@ static void visitLoopControl(
       if (!innerDo)
         break; // No deeper loop; stop collecting collapsed bounds.
 
+      Fortran::lower::markDoConstructAsCollapsed(*innerDo);
       loopControl = &*innerDo->GetLoopControl();
       mlir::Location loc =
           converter.genLocation(Fortran::parser::FindSourceLocation(*innerDo));
@@ -2033,6 +2035,8 @@ buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
 }
 
 static bool hasEarlyReturn(Fortran::lower::pft::Evaluation &eval) {
+  if (!eval.hasNestedEvaluations())
+    return false;
   bool hasReturnStmt = false;
   for (auto &e : eval.getNestedEvaluations()) {
     e.visit(Fortran::common::visitors{
@@ -4684,6 +4688,23 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
   return false;
 }
 
+static llvm::SmallPtrSet<const Fortran::parser::DoConstruct *, 8>
+    collapsedDoConstructs;
+
+void Fortran::lower::markDoConstructAsCollapsed(
+    const Fortran::parser::DoConstruct &dc) {
+  collapsedDoConstructs.insert(&dc);
+}
+
+bool Fortran::lower::isCollapsedDoConstruct(
+    const Fortran::parser::DoConstruct &dc) {
+  return collapsedDoConstructs.contains(&dc);
+}
+
+void Fortran::lower::clearCollapsedDoConstructs() {
+  collapsedDoConstructs.clear();
+}
+
 bool Fortran::lower::isInsideOpenACCComputeConstruct(
     fir::FirOpBuilder &builder) {
   return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(
diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
index ca932c1b159ba..0850eb5b696e8 100644
--- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
@@ -1,7 +1,7 @@
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
 
-! Verify collapse(force:2) sinks prologue (between loops) and epilogue (after inner loop)
-! into the acc.loop region body.
+! Verify collapse(force:2) sinks prologue/epilogue and eliminates
+! the inner loop (absorbed by the outer collapse).
 
 subroutine collapse_force_sink(n, m)
   integer, intent(in) :: n, m
@@ -22,20 +22,13 @@ subroutine collapse_force_sink(n, m)
 
 ! CHECK: func.func @_QPcollapse_force_sink(
 ! CHECK: acc.parallel
-! Ensure outer acc.loop is combined(parallel)
-! CHECK: acc.loop combined(parallel)
-! Prologue: constant 4.2 and an assign before inner loop
+! Only one acc.loop should exist (the outer collapsed one)
+! CHECK-COUNT-1: acc.loop
 ! CHECK: arith.constant 4.200000e+00
 ! CHECK: hlfir.assign
-! Inner loop and its body include 2.0 add and an assign
-! CHECK: acc.loop
 ! CHECK: arith.constant 2.000000e+00
 ! CHECK: arith.addf
 ! CHECK: hlfir.assign
-! Epilogue: constant 7.3 and an assign after inner loop
 ! CHECK: arith.constant 7.300000e+00
 ! CHECK: hlfir.assign
-! And the outer acc.loop has collapse = [2]
 ! CHECK: } attributes {collapse = [2]
-
-
diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
new file mode 100644
index 0000000000000..5eed4dc5a5c76
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
@@ -0,0 +1,28 @@
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! Verify that collapse(force:N) on non-tightly nested loops (with code
+! between loops) does not crash the compiler.
+
+subroutine collapse_force_non_tight(a, n)
+  implicit none
+  integer, intent(in) :: n
+  real, intent(inout) :: a(n, n, n)
+  integer :: i, j, k
+  real :: tmp
+
+  !$acc parallel loop collapse(force:3) copy(a)
+  do i = 1, n
+    do j = 1, n
+      tmp = real(i + j)
+      do k = 1, n
+        a(i, j, k) = tmp + real(k)
+      end do
+    end do
+  end do
+  !$acc end parallel loop
+end subroutine
+
+! CHECK: func.func @_QPcollapse_force_non_tight
+! CHECK: acc.parallel
+! CHECK: acc.loop
+! CHECK: collapse = [3]

>From 8fd1c73176f391bbbca7b9555301e85eeb3411ac Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 16 Apr 2026 16:42:38 -0700
Subject: [PATCH 2/4] [Flang][OpenACC] Improve collapse(force:N) tests per
 review

- Verify prologue/body/epilogue are inside the collapsed loop body
- Add test for collapse(force:2) with inner !$acc loop that should
  not be absorbed by the collapse

Made-with: Cursor
---
 .../acc-loop-collapse-force-lowering.f90      | 10 ++--
 ...loop-collapse-force-non-tightly-nested.f90 | 49 +++++++++++++++++--
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
index 0850eb5b696e8..0effb34f9e301 100644
--- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
@@ -20,15 +20,15 @@ subroutine collapse_force_sink(n, m)
   !$acc end parallel loop
 end subroutine
 
-! CHECK: func.func @_QPcollapse_force_sink(
+! CHECK-LABEL: func.func @_QPcollapse_force_sink(
 ! CHECK: acc.parallel
-! Only one acc.loop should exist (the outer collapsed one)
-! CHECK-COUNT-1: acc.loop
+! Only one acc.loop with collapse = [2], inner loop absorbed
+! CHECK: acc.loop combined(parallel)
+! Prologue (4.2), body (2.0 add), epilogue (7.3) all inside
 ! CHECK: arith.constant 4.200000e+00
 ! CHECK: hlfir.assign
 ! CHECK: arith.constant 2.000000e+00
-! CHECK: arith.addf
 ! CHECK: hlfir.assign
 ! CHECK: arith.constant 7.300000e+00
 ! CHECK: hlfir.assign
-! CHECK: } attributes {collapse = [2]
+! CHECK: collapse = [2]
diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
index 5eed4dc5a5c76..2ac4fa1144926 100644
--- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
@@ -1,7 +1,8 @@
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
 
-! Verify that collapse(force:N) on non-tightly nested loops (with code
-! between loops) does not crash the compiler.
+! Test 1: collapse(force:3) on non-tightly nested loops.
+! The inner do k loop should be absorbed by the collapse.
+! The prologue (tmp = ...) should be inside the collapsed acc.loop body.
 
 subroutine collapse_force_non_tight(a, n)
   implicit none
@@ -22,7 +23,47 @@ subroutine collapse_force_non_tight(a, n)
   !$acc end parallel loop
 end subroutine
 
-! CHECK: func.func @_QPcollapse_force_non_tight
+! CHECK-LABEL: func.func @_QPcollapse_force_non_tight
 ! CHECK: acc.parallel
-! CHECK: acc.loop
+! Only one acc.loop with 3 IVs
+! CHECK: acc.loop combined(parallel)
+! Prologue (tmp = i + j) inside the loop body
+! CHECK: arith.addi
+! Body assignment inside the loop body
+! CHECK: hlfir.designate
+! CHECK: hlfir.assign
 ! CHECK: collapse = [3]
+
+! -----
+
+! Test 2: collapse(force:2) with a non-collapsed inner !$acc loop.
+! The do k loop has its own !$acc loop directive and should NOT be
+! absorbed by the collapse — it must remain as a separate acc.loop.
+
+subroutine collapse_force_with_inner_acc_loop(a, n)
+  implicit none
+  integer, intent(in) :: n
+  real, intent(inout) :: a(n, n, n)
+  integer :: i, j, k
+
+  !$acc parallel loop collapse(force:2) copy(a)
+  do i = 1, n
+    do j = 1, n
+      !$acc loop
+      do k = 1, n
+        a(i, j, k) = real(i + j + k)
+      end do
+    end do
+  end do
+  !$acc end parallel loop
+end subroutine
+
+! CHECK-LABEL: func.func @_QPcollapse_force_with_inner_acc_loop
+! CHECK: acc.parallel
+! Outer collapsed acc.loop
+! CHECK: acc.loop combined(parallel)
+! Inner acc.loop for k should remain (not absorbed by collapse)
+! CHECK: acc.loop
+! CHECK: acc.yield
+! Outer has collapse = [2]
+! CHECK: collapse = [2]

>From 59397ecaa89e823b8f84b80d0e377ded2b4ceee2 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 16 Apr 2026 22:13:53 -0700
Subject: [PATCH 3/4] [Flang][OpenACC] Add collapse(force:3) + inner acc loop
 test

Test that collapse(force:3) correctly absorbs 3 loops while preserving
a deeper !$acc loop directive, with statements before the inner loop.

Made-with: Cursor
---
 ...loop-collapse-force-non-tightly-nested.f90 | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
index 2ac4fa1144926..36c15143bff5a 100644
--- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-non-tightly-nested.f90
@@ -67,3 +67,46 @@ subroutine collapse_force_with_inner_acc_loop(a, n)
 ! CHECK: acc.yield
 ! Outer has collapse = [2]
 ! CHECK: collapse = [2]
+
+! -----
+
+! Test 3: collapse(force:3) with statements before an inner !$acc loop.
+! The do k loop is absorbed by the collapse (3 IVs), but do l with
+! its own !$acc loop directive should remain as a separate acc.loop.
+
+subroutine collapse_force3_with_inner_acc_loop(a, n)
+  implicit none
+  integer, intent(in) :: n
+  real, intent(inout) :: a(n, n, n, n)
+  integer :: i, j, k, l
+  real :: tmp
+
+  !$acc parallel loop collapse(force:3) copy(a)
+  do i = 1, n
+    do j = 1, n
+      tmp = real(i + j)
+      do k = 1, n
+        a(i, j, k, 1) = tmp
+        !$acc loop
+        do l = 1, n
+          a(i, j, k, l) = a(i, j, k, l) + real(l)
+        end do
+      end do
+    end do
+  end do
+  !$acc end parallel loop
+end subroutine
+
+! CHECK-LABEL: func.func @_QPcollapse_force3_with_inner_acc_loop
+! CHECK: acc.parallel
+! Outer collapsed acc.loop with 3 IVs
+! CHECK: acc.loop combined(parallel)
+! Prologue (tmp = i + j) inside the collapsed body
+! CHECK: arith.addi
+! Statement before inner acc loop (a(i,j,k,1) = tmp)
+! CHECK: hlfir.assign
+! Inner acc.loop for l should remain
+! CHECK: acc.loop
+! CHECK: acc.yield
+! Outer has collapse = [3]
+! CHECK: collapse = [3]

>From fdaf594352f53208ef595ccdabbacd38760e766d Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 17 Apr 2026 10:11:04 -0700
Subject: [PATCH 4/4] [Flang][OpenACC] Address review: improve tests, add
 semantics check

Made-with: Cursor
---
 .../Semantics/OpenACC/acc-collapse-force.f90   | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/flang/test/Semantics/OpenACC/acc-collapse-force.f90 b/flang/test/Semantics/OpenACC/acc-collapse-force.f90
index 80b1060ebe6c7..f613a0fb77ce5 100644
--- a/flang/test/Semantics/OpenACC/acc-collapse-force.f90
+++ b/flang/test/Semantics/OpenACC/acc-collapse-force.f90
@@ -17,3 +17,21 @@ subroutine sub3()
     enddo
   enddo
 end subroutine
+
+! Check that !$acc loop on a loop inside a collapse nest is rejected.
+subroutine sub4()
+  integer :: i, j, k
+  integer, parameter :: n = 100
+  real :: a(n, n, n), tmp
+  !$acc parallel loop collapse(force:3) copy(a)
+  do i = 1, n
+    do j = 1, n
+      tmp = real(i + j)
+      !ERROR: LOOP directive not expected in COLLAPSE loop nest
+      !$acc loop
+      do k = 1, n
+        a(i, j, k) = tmp + real(k)
+      end do
+    end do
+  end do
+end subroutine



More information about the flang-commits mailing list