[Mlir-commits] [flang] [mlir] [flang][acc] Fix collapse(force:N) + reduction producing redundant inner loop (PR #191312)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Thu Apr 9 15:19:16 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir

@llvm/pr-subscribers-mlir

Author: khaki3

<details>
<summary>Changes</summary>

When `collapse(force:2) reduction(+:r)` is applied to a nested loop, Flang's lowering creates an outer `acc.loop` with N control variables covering all collapsed dimensions. However, the PFT evaluation walker still encounters the inner `do` constructs and calls `genOpenACCLoopFromDoConstruct` for each, generating redundant inner `acc.loop` ops that re-iterate dimensions already distributed by the outer.

**Root cause**: `processDoLoopBounds` absorbs N levels of loop bounds into the outer `acc.loop`, but nothing prevents the PFT walker from also generating separate `acc.loop` ops for those same inner `do` constructs.

**Fix**: Add `isInsideCollapsedACCLoop()` which checks if the current insertion point is inside a collapsed `acc.loop` with unconsumed dimensions. In `Bridge.cpp`, when this returns true, skip both `acc.loop` generation and normal loop generation for the inner `do` construct — just lower the body directly. The IV is already available from the parent `acc.loop`'s block argument.

---
Full diff: https://github.com/llvm/llvm-project/pull/191312.diff


5 Files Affected:

- (modified) flang/include/flang/Lower/OpenACC.h (+6) 
- (modified) flang/lib/Lower/Bridge.cpp (+12) 
- (modified) flang/lib/Lower/OpenACC.cpp (+16) 
- (modified) flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 (+7-9) 
- (added) mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir (+31) 


``````````diff
diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index 745c8686457f4..15b53d3b3aae9 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -110,6 +110,12 @@ getCollapseSizeAndForce(const Fortran::parser::AccClauseList &);
 /// Checks whether the current insertion point is inside OpenACC loop.
 bool isInOpenACCLoop(fir::FirOpBuilder &);
 
+/// Checks whether the current insertion point is inside a collapsed acc.loop
+/// and there are still collapsed dimensions to consume (i.e., this do-loop
+/// should not generate a separate loop because the parent already distributes
+/// this dimension).
+bool isInsideCollapsedACCLoop(fir::FirOpBuilder &);
+
 /// Checks whether the current insertion point is inside OpenACC compute
 /// construct.
 bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 63867fab57678..5bd862c74661b 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2493,6 +2493,18 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     Fortran::lower::pft::Evaluation &eval = getEval();
     bool unstructuredContext = eval.lowerAsUnstructured();
 
+    // If this tightly-nested do-loop is inside a collapsed acc.loop and
+    // its dimension is already covered by the parent's control variables,
+    // skip generating any loop — just lower the body.  The IV value is
+    // already available from the parent acc.loop's block argument (stored
+    // into the private variable by buildACCLoopOp).
+    if (Fortran::lower::isInsideCollapsedACCLoop(*builder)) {
+      auto iter = eval.getNestedEvaluations().begin();
+      for (auto end = --eval.getNestedEvaluations().end(); iter != end; ++iter)
+        genFIR(*iter, unstructuredContext);
+      return;
+    }
+
     // Loops with induction variables inside OpenACC compute constructs
     // need special handling to ensure that the IVs are privatized.
     if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 2154f38dca568..e181f2afc2290 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4546,6 +4546,22 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
   return false;
 }
 
+bool Fortran::lower::isInsideCollapsedACCLoop(fir::FirOpBuilder &builder) {
+  auto parentLoop =
+      builder.getBlock()->getParent()->getParentOfType<mlir::acc::LoopOp>();
+  if (!parentLoop)
+    return false;
+  unsigned numParentIVs = parentLoop.getBody().getNumArguments();
+  if (numParentIVs <= 1)
+    return false;
+
+  // Check that there are still collapsed dimensions to consume:
+  // no inner acc.loop has been generated yet.
+  unsigned innerAccLoops = 0;
+  parentLoop.getBody().walk([&](mlir::acc::LoopOp) { ++innerAccLoops; });
+  return (innerAccLoops + 1 < numParentIVs);
+}
+
 bool Fortran::lower::isInsideOpenACCComputeConstruct(
     fir::FirOpBuilder &builder) {
   return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(
diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
index ca932c1b159ba..006f613dd2109 100644
--- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
@@ -1,7 +1,7 @@
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
 
 ! Verify collapse(force:2) sinks prologue (between loops) and epilogue (after inner loop)
-! into the acc.loop region body.
+! into the acc.loop region body without generating a redundant inner acc.loop.
 
 subroutine collapse_force_sink(n, m)
   integer, intent(in) :: n, m
@@ -22,20 +22,18 @@ subroutine collapse_force_sink(n, m)
 
 ! CHECK: func.func @_QPcollapse_force_sink(
 ! CHECK: acc.parallel
-! Ensure outer acc.loop is combined(parallel)
+! Ensure outer acc.loop is combined(parallel) with 2 IVs and no inner acc.loop
 ! CHECK: acc.loop combined(parallel)
-! Prologue: constant 4.2 and an assign before inner loop
+! Prologue: constant 4.2 and an assign before inner loop body
 ! CHECK: arith.constant 4.200000e+00
 ! CHECK: hlfir.assign
-! Inner loop and its body include 2.0 add and an assign
-! CHECK: acc.loop
+! Inner loop body (no acc.loop wrapping it): 2.0 add and an assign
 ! CHECK: arith.constant 2.000000e+00
 ! CHECK: arith.addf
 ! CHECK: hlfir.assign
-! Epilogue: constant 7.3 and an assign after inner loop
+! Epilogue: constant 7.3 and an assign after inner loop body
 ! CHECK: arith.constant 7.300000e+00
 ! CHECK: hlfir.assign
-! And the outer acc.loop has collapse = [2]
+! The acc.loop has collapse = [2] and no inner acc.loop
 ! CHECK: } attributes {collapse = [2]
-
-
+! CHECK-NOT: acc.loop
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir
new file mode 100644
index 0000000000000..5d2f01362f3b0
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// Test that a collapsed acc.loop with 2 IVs converts correctly to
+// scf.parallel with 2 dimensions and no redundant inner loops.
+
+// CHECK-LABEL: func.func @collapse_two_ivs
+// CHECK:       acc.compute_region
+// CHECK:       scf.parallel
+// CHECK-NOT:   scf.for
+// CHECK:       scf.reduce
+func.func @collapse_two_ivs(%buf: memref<1xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %c200 = arith.constant 200 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<1xf32>) -> memref<1xf32>
+  acc.parallel dataOperands(%dev : memref<1xf32>) {
+    acc.loop combined(parallel) control(%i : index, %j : index) = (%c1, %c1 : index, index) to (%c100, %c200 : index, index) step (%c1, %c1 : index, index) {
+      %vi = arith.index_cast %i : index to i32
+      %vj = arith.index_cast %j : index to i32
+      %sum = arith.addi %vi, %vj : i32
+      %valf = arith.sitofp %sum : i32 to f32
+      memref.store %valf, %dev[%c0] : memref<1xf32>
+      acc.yield
+    } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true>, independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<1xf32>) to varPtr(%buf : memref<1xf32>)
+  return
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/191312


More information about the Mlir-commits mailing list