[flang-commits] [flang] [flang][acc] Implement cache directive lowering (PR #174897)

via flang-commits flang-commits at lists.llvm.org
Wed Jan 7 18:50:47 PST 2026


https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/174897

>From 7ecc7bdf41a4968832c20759afd6a59eaff30186 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 14:17:15 -0800
Subject: [PATCH 1/4] [flang][ACC] Implement cache directive lowering

The acc.cache operation is currently defined to be associated with a
loop. However, this implementation generates acc.cache as a standalone
data entry operation, similar to acc.private. The acc.cache operation
definition will be updated in a future change to reflect this usage.

Key implementation details:
- Add genCacheBounds() to generate acc.bounds for cache operands,
  handling single elements (arr(i)), full ranges (arr(l:u)), and
  partial ranges with missing bounds (arr(l:) or arr(:u))
- Generate acc.cache with the acc_cache or acc_cache_readonly data clause,
  based on the readonly modifier
- Update the symbol map so subsequent lowering uses the cache result
- Insert cache operations after loop iterator setup
---
 flang/lib/Lower/Bridge.cpp  |  12 ++-
 flang/lib/Lower/OpenACC.cpp | 195 +++++++++++++++++++++++++++++++++++-
 2 files changed, 203 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 6c3631438a596..97bbda2db97a2 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3506,7 +3506,14 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
   void genFIR(const Fortran::parser::OpenACCConstruct &acc) {
     mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint();
-    localSymbols.pushScope();
+
+    // Cache constructs should not push/pop a scope because they need to update
+    // the symbol map for subsequent statements in the same loop body.
+    bool isCacheConstruct =
+        std::holds_alternative<Fortran::parser::OpenACCCacheConstruct>(acc.u);
+
+    if (!isCacheConstruct)
+      localSymbols.pushScope();
     mlir::Value exitCond = genOpenACCConstruct(
         *this, bridge.getSemanticsContext(), getEval(), acc, localSymbols);
 
@@ -3605,7 +3612,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       for (Fortran::lower::pft::Evaluation &e : curEval->getNestedEvaluations())
         genFIR(e);
     }
-    localSymbols.popScope();
+    if (!isCacheConstruct)
+      localSymbols.popScope();
     builder->restoreInsertionPoint(insertPt);
 
     if (accLoop && exitCond) {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 52fee7baf9de1..a189736c56e40 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4837,12 +4837,203 @@ genACC(Fortran::lower::AbstractConverter &converter,
       atomicConstruct.u);
 }
 
+/// Generate acc.bounds for cache directive. Handles:
+/// - Single element: arr(i) or arr(5)
+/// - Full range: arr(lower:upper)
+/// - Missing upper: arr(lower:) - uses array's upper bound
+/// - Missing lower: arr(:upper) - uses array's lower bound
+static void
+genCacheBounds(Fortran::lower::AbstractConverter &converter,
+               Fortran::semantics::SemanticsContext &semanticsContext,
+               Fortran::lower::StatementContext &stmtCtx,
+               const Fortran::parser::AccObject &accObject,
+               std::stringstream &asFortran,
+               llvm::SmallVectorImpl<mlir::Value> &bounds) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::Location loc = converter.getCurrentLocation();
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Type boundTy = builder.getType<mlir::acc::DataBoundsType>();
+
+  Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
+  Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
+
+  std::optional<Fortran::evaluate::DataRef> dataRef;
+  Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+      [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+  if (designator)
+    dataRef = Fortran::evaluate::ExtractDataRef(*designator);
+
+  if (!dataRef)
+    return;
+
+  auto *arrayRef = std::get_if<Fortran::evaluate::ArrayRef>(&dataRef->u);
+  if (!arrayRef)
+    return;
+
+  const auto &subscripts = arrayRef->subscript();
+  int dimension = 0;
+  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
+  fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(symbol);
+
+  for (const auto &subscript : subscripts) {
+    if (dimension != 0)
+      asFortran << ',';
+
+    mlir::Value lbound, extent;
+    mlir::Value arrayLb =
+        fir::factory::readLowerBound(builder, loc, dataExv, dimension, one);
+    mlir::Value arrayExtent =
+        fir::factory::readExtent(builder, loc, dataExv, dimension);
+
+    const auto *triplet = std::get_if<Fortran::evaluate::Triplet>(&subscript.u);
+
+    if (triplet) {
+      asFortran << ':';
+
+      // Compute lower bound (use array lb if not specified).
+      Fortran::semantics::MaybeExpr lowerSexpr =
+          Fortran::evaluate::AsGenericExpr(triplet->lower());
+      mlir::Value lb;
+      if (lowerSexpr) {
+        auto lowerConst = Fortran::evaluate::ToInt64(*lowerSexpr);
+        if (lowerConst) {
+          lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
+        } else {
+          lb = builder.createConvert(
+              loc, idxTy,
+              fir::getBase(converter.genExprValue(loc, *lowerSexpr, stmtCtx)));
+        }
+      } else {
+        lb = arrayLb;
+      }
+
+      // Compute upper bound (use array ub if not specified).
+      Fortran::semantics::MaybeExpr upperSexpr =
+          Fortran::evaluate::AsGenericExpr(triplet->upper());
+      mlir::Value ub;
+      if (upperSexpr) {
+        auto upperConst = Fortran::evaluate::ToInt64(*upperSexpr);
+        if (upperConst) {
+          ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
+        } else {
+          ub = builder.createConvert(
+              loc, idxTy,
+              fir::getBase(converter.genExprValue(loc, *upperSexpr, stmtCtx)));
+        }
+      } else {
+        // arr(lower:) - upper is array's upper bound
+        ub = mlir::arith::AddIOp::create(
+            builder, loc,
+            mlir::arith::SubIOp::create(builder, loc, arrayLb, one),
+            arrayExtent);
+      }
+
+      // Normalize to zero-based and compute extent.
+      lbound = mlir::arith::SubIOp::create(builder, loc, lb, arrayLb);
+      mlir::Value ubound =
+          mlir::arith::SubIOp::create(builder, loc, ub, arrayLb);
+      extent = mlir::arith::AddIOp::create(
+          builder, loc,
+          mlir::arith::SubIOp::create(builder, loc, ubound, lbound), one);
+    } else {
+      // Single element: arr(elem)
+      using IndirectSubscriptIntegerExpr =
+          Fortran::evaluate::IndirectSubscriptIntegerExpr;
+      using SubscriptInteger = Fortran::evaluate::SubscriptInteger;
+      Fortran::evaluate::Expr<SubscriptInteger> scalarExpr =
+          std::get<IndirectSubscriptIntegerExpr>(subscript.u).value();
+      auto elemConst = Fortran::evaluate::ToInt64(scalarExpr);
+
+      mlir::Value elem;
+      if (elemConst) {
+        elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
+      } else {
+        Fortran::semantics::SomeExpr sexpr =
+            Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
+        elem = builder.createConvert(
+            loc, idxTy,
+            fir::getBase(converter.genExprValue(loc, sexpr, stmtCtx)));
+      }
+
+      lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
+      extent = one;
+    }
+
+    mlir::Value bound = mlir::acc::DataBoundsOp::create(
+        builder, loc, boundTy, lbound, /*upperbound=*/mlir::Value{}, extent,
+        /*stride=*/one, /*strideInBytes=*/false, arrayLb);
+    bounds.push_back(bound);
+    ++dimension;
+  }
+}
+
 static void
 genACC(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semanticsContext,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
-  mlir::Location loc = converter.genLocation(cacheConstruct.source);
-  TODO(loc, "OpenACC cache directive");
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  // Find enclosing acc.loop
+  auto loopOp = builder.getRegion().getParentOfType<mlir::acc::LoopOp>();
+  if (!loopOp)
+    return;
+
+  // Set insertion point before terminator (after loop variable setup)
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::Block &loopBody = loopOp.getRegion().front();
+  builder.setInsertionPoint(loopBody.getTerminator());
+
+  const auto &objectListWithModifier =
+      std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
+  const auto &accObjectList =
+      std::get<Fortran::parser::AccObjectList>(objectListWithModifier.t);
+  const auto &modifier =
+      std::get<std::optional<Fortran::parser::AccDataModifier>>(
+          objectListWithModifier.t);
+  mlir::acc::DataClause dataClause =
+      (modifier &&
+       (*modifier).v == Fortran::parser::AccDataModifier::Modifier::ReadOnly)
+          ? mlir::acc::DataClause::acc_cache_readonly
+          : mlir::acc::DataClause::acc_cache;
+
+  Fortran::lower::StatementContext stmtCtx;
+
+  for (const auto &accObject : accObjectList.v) {
+    mlir::Location operandLocation = genOperandLocation(converter, accObject);
+    Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
+
+    std::stringstream asFortran;
+    asFortran << symbol.name().ToString();
+
+    fir::factory::AddrAndBoundsInfo info = getDataOperandBaseAddr(
+        converter, builder, symbol, operandLocation, /*unwrapFirBox=*/true);
+    mlir::Value baseAddr = info.addr;
+
+    llvm::SmallVector<mlir::Value> bounds;
+    genCacheBounds(converter, semanticsContext, stmtCtx, accObject, asFortran,
+                   bounds);
+
+    mlir::acc::CacheOp cacheOp = createDataEntryOp<mlir::acc::CacheOp>(
+        builder, operandLocation, baseAddr, asFortran, bounds,
+        /*structured=*/false, /*implicit=*/false, dataClause,
+        baseAddr.getType(),
+        /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
+        /*unwrapBoxAddr=*/true, /*isPresent=*/mlir::Value{});
+
+    // Update symbol map so future lowering uses the cache result
+    Fortran::lower::SymMap &symbolMap = converter.getSymbolMap();
+    if (auto hostDef = symbolMap.lookupVariableDefinition(symbol)) {
+      // Clone the host declare with the cache result as its input.
+      // The first operand is the memref/base for both hlfir::DeclareOp and
+      // fir::DeclareOp.
+      mlir::Operation *hostDefOp = (*hostDef).getOperation();
+      mlir::IRMapping mapper;
+      mapper.map(hostDefOp->getOperand(0), cacheOp.getAccVar());
+      mlir::Operation *newDef = builder.clone(*hostDefOp, mapper);
+      symbolMap.addVariableDefinition(
+          symbol, llvm::cast<fir::FortranVariableOpInterface>(newDef));
+    }
+  }
 }
 
 mlir::Value Fortran::lower::genOpenACCConstruct(

>From f6e001269fbcdb7d0d4c0b8f105edb787990e2e6 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 16:54:34 -0800
Subject: [PATCH 2/4] Update the acc-cache test

---
 flang/lib/Lower/OpenACC.cpp                 |  14 +--
 flang/test/Lower/OpenACC/Todo/acc-cache.f90 |  15 ---
 flang/test/Lower/OpenACC/acc-cache.f90      | 113 ++++++++++++++++++++
 3 files changed, 116 insertions(+), 26 deletions(-)
 delete mode 100644 flang/test/Lower/OpenACC/Todo/acc-cache.f90
 create mode 100644 flang/test/Lower/OpenACC/acc-cache.f90

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a189736c56e40..101e699985532 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4899,9 +4899,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
         if (lowerConst) {
           lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
         } else {
-          lb = builder.createConvert(
-              loc, idxTy,
-              fir::getBase(converter.genExprValue(loc, *lowerSexpr, stmtCtx)));
+          mlir::emitError(loc, "unsupported OpenACC cache subscript");
         }
       } else {
         lb = arrayLb;
@@ -4916,9 +4914,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
         if (upperConst) {
           ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
         } else {
-          ub = builder.createConvert(
-              loc, idxTy,
-              fir::getBase(converter.genExprValue(loc, *upperSexpr, stmtCtx)));
+          mlir::emitError(loc, "unsupported OpenACC cache subscript");
         }
       } else {
         // arr(lower:) - upper is array's upper bound
@@ -4948,11 +4944,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
       if (elemConst) {
         elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
       } else {
-        Fortran::semantics::SomeExpr sexpr =
-            Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
-        elem = builder.createConvert(
-            loc, idxTy,
-            fir::getBase(converter.genExprValue(loc, sexpr, stmtCtx)));
+        mlir::emitError(loc, "unsupported OpenACC cache subscript");
       }
 
       lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
diff --git a/flang/test/Lower/OpenACC/Todo/acc-cache.f90 b/flang/test/Lower/OpenACC/Todo/acc-cache.f90
deleted file mode 100644
index 8b81e876ed2c9..0000000000000
--- a/flang/test/Lower/OpenACC/Todo/acc-cache.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %s -o - 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: OpenACC cache directive
-
-subroutine test_cache()
-  integer, parameter :: n = 10
-  real, dimension(n) :: a, b
-  integer :: i
-
-  !$acc loop
-  do i = 1, n
-    !$acc cache(b)
-    a(i) = b(i)
-  end do
-end subroutine
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
new file mode 100644
index 0000000000000..ce30f52d0c687
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -0,0 +1,113 @@
+! This test checks lowering of OpenACC cache directive.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+
+! CHECK-LABEL: func.func @_QPtest_cache_basic()
+subroutine test_cache_basic()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b)
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_basicEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_readonly()
+subroutine test_cache_readonly()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(readonly: b)
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_cache_readonly>, name = "b"
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_readonlyEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_array_section()
+! For b(2:5): lowerbound = 2-1 = 1, extent = 5-2+1 = 4
+subroutine test_cache_array_section()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(2:5))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C2:.*]] = arith.constant 2 : index
+! CHECK: %[[C5:.*]] = arith.constant 5 : index
+! CHECK: %[[LB:.*]] = arith.subi %[[C2]], %[[C1]] : index
+! CHECK: %[[TMP1:.*]] = arith.subi %[[C5]], %[[C1]] : index
+! CHECK: %[[TMP2:.*]] = arith.subi %[[TMP1]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP2]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_array_sectionEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_multiple()
+subroutine test_cache_multiple()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b, c
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b, c)
+    a(i) = b(i) + c(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE_B:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
+! CHECK: hlfir.declare %[[CACHE_B]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEb"}
+! CHECK: %[[CACHE_C:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "c"
+! CHECK: hlfir.declare %[[CACHE_C]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEc"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_2d_array()
+! For b(1:5, 1:5): each dimension has lowerbound = 0, extent = 5
+subroutine test_cache_2d_array()
+  integer, parameter :: n = 10
+  real, dimension(n, n) :: a, b
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(1:5, 1:5))
+    do j = 1, n
+      a(i,j) = b(i,j)
+    end do
+  end do
+
+! CHECK: acc.loop
+! Dimension 1: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
+! CHECK: arith.constant 1 : index
+! CHECK: arith.constant 5 : index
+! CHECK: arith.subi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
+! CHECK: arith.constant 5 : index
+! CHECK: arith.subi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
+end subroutine

>From 0929d7f49a90535cd3b6d2f4b003c219ddce53ce Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 18:12:04 -0800
Subject: [PATCH 3/4] Support iterators in bounds

---
 flang/lib/Lower/OpenACC.cpp            | 54 ++++++++++----------------
 flang/test/Lower/OpenACC/acc-cache.f90 | 24 ++++++++++++
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 101e699985532..a285707ea6e85 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4870,6 +4870,16 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
   if (!arrayRef)
     return;
 
+  // Helper to generate index value from expression.
+  // Optimize for compile-time constants to generate index type directly.
+  auto genIndex =
+      [&](const Fortran::semantics::MaybeExpr &expr) -> mlir::Value {
+    if (auto constVal = Fortran::evaluate::ToInt64(*expr))
+      return builder.createIntegerConstant(loc, idxTy, *constVal);
+    return builder.createConvert(
+        loc, idxTy, fir::getBase(converter.genExprValue(loc, *expr, stmtCtx)));
+  };
+
   const auto &subscripts = arrayRef->subscript();
   int dimension = 0;
   mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
@@ -4891,31 +4901,16 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
       asFortran << ':';
 
       // Compute lower bound (use array lb if not specified).
-      Fortran::semantics::MaybeExpr lowerSexpr =
+      Fortran::semantics::MaybeExpr lowerExpr =
           Fortran::evaluate::AsGenericExpr(triplet->lower());
-      mlir::Value lb;
-      if (lowerSexpr) {
-        auto lowerConst = Fortran::evaluate::ToInt64(*lowerSexpr);
-        if (lowerConst) {
-          lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
-        } else {
-          mlir::emitError(loc, "unsupported OpenACC cache subscript");
-        }
-      } else {
-        lb = arrayLb;
-      }
+      mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
 
       // Compute upper bound (use array ub if not specified).
-      Fortran::semantics::MaybeExpr upperSexpr =
+      Fortran::semantics::MaybeExpr upperExpr =
           Fortran::evaluate::AsGenericExpr(triplet->upper());
       mlir::Value ub;
-      if (upperSexpr) {
-        auto upperConst = Fortran::evaluate::ToInt64(*upperSexpr);
-        if (upperConst) {
-          ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
-        } else {
-          mlir::emitError(loc, "unsupported OpenACC cache subscript");
-        }
+      if (upperExpr) {
+        ub = genIndex(upperExpr);
       } else {
         // arr(lower:) - upper is array's upper bound
         ub = mlir::arith::AddIOp::create(
@@ -4933,19 +4928,12 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
           mlir::arith::SubIOp::create(builder, loc, ubound, lbound), one);
     } else {
       // Single element: arr(elem)
-      using IndirectSubscriptIntegerExpr =
-          Fortran::evaluate::IndirectSubscriptIntegerExpr;
-      using SubscriptInteger = Fortran::evaluate::SubscriptInteger;
-      Fortran::evaluate::Expr<SubscriptInteger> scalarExpr =
-          std::get<IndirectSubscriptIntegerExpr>(subscript.u).value();
-      auto elemConst = Fortran::evaluate::ToInt64(scalarExpr);
-
-      mlir::Value elem;
-      if (elemConst) {
-        elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
-      } else {
-        mlir::emitError(loc, "unsupported OpenACC cache subscript");
-      }
+      Fortran::evaluate::Expr<Fortran::evaluate::SubscriptInteger> scalarExpr =
+          std::get<Fortran::evaluate::IndirectSubscriptIntegerExpr>(subscript.u)
+              .value();
+      Fortran::semantics::MaybeExpr elemExpr =
+          Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
+      mlir::Value elem = genIndex(elemExpr);
 
       lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
       extent = one;
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index ce30f52d0c687..cdf643c2128b6 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -111,3 +111,27 @@ subroutine test_cache_2d_array()
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_loop_var()
+! Test cache with loop variable dependent bounds: b(i:i+2)
+subroutine test_cache_loop_var()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n-2
+    !$acc cache(b(i:i+2))
+    a(i) = b(i) + b(i+1) + b(i+2)
+  end do
+
+! CHECK: acc.loop
+! CHECK: fir.load
+! CHECK: fir.convert
+! CHECK: fir.load
+! CHECK: arith.addi
+! CHECK: fir.convert
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
+end subroutine

>From df7dfce343e82d130ba42fb2342a7e17a53f1789 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 18:44:15 -0800
Subject: [PATCH 4/4] Fix for nested loops

---
 flang/lib/Lower/OpenACC.cpp            |   7 +-
 flang/test/Lower/OpenACC/acc-cache.f90 | 118 ++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 18 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a285707ea6e85..c8d99f1106249 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4953,16 +4953,11 @@ genACC(Fortran::lower::AbstractConverter &converter,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
 
-  // Find enclosing acc.loop
+  // Verify we're inside an acc.loop region.
   auto loopOp = builder.getRegion().getParentOfType<mlir::acc::LoopOp>();
   if (!loopOp)
     return;
 
-  // Set insertion point before terminator (after loop variable setup)
-  mlir::OpBuilder::InsertionGuard guard(builder);
-  mlir::Block &loopBody = loopOp.getRegion().front();
-  builder.setInsertionPoint(loopBody.getTerminator());
-
   const auto &objectListWithModifier =
       std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
   const auto &accObjectList =
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index cdf643c2128b6..84f8de4ebc714 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -99,15 +99,20 @@ subroutine test_cache_2d_array()
   end do
 
 ! CHECK: acc.loop
-! Dimension 1: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
-! CHECK: arith.constant 1 : index
-! CHECK: arith.constant 5 : index
+! CHECK-DAG: arith.constant 1 : index
+! CHECK-DAG: arith.constant 5 : index
+! Dimension 1: lowerbound = 1-1 = 0, extent = (5-1)-0+1 = 5
+! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: lowerbound = 1-1 = 0, extent = (5-1)-0+1 = 5
+! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %{{.*}} : index
 ! CHECK: arith.subi
-! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! Dimension 2: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
-! CHECK: arith.constant 5 : index
 ! CHECK: arith.subi
-! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: arith.addi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 end subroutine
@@ -126,12 +131,101 @@ subroutine test_cache_loop_var()
   end do
 
 ! CHECK: acc.loop
-! CHECK: fir.load
-! CHECK: fir.convert
-! CHECK: fir.load
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! b(i:i+2): lowerbound = i-1, extent = (i+2)-(i)+1 = 3
+! CHECK: fir.convert %{{.*}} : (i64) -> index
 ! CHECK: arith.addi
-! CHECK: fir.convert
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_2d_loop_vars()
+! Test 2D cache with swapped loop variables inside nested loop: b(j:j+1, i:i+1)
+subroutine test_cache_2d_loop_vars()
+  integer, parameter :: n = 10
+  real, dimension(n, n) :: a, b
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n-1
+    do j = 1, n-1
+      !$acc cache(b(j:j+1, i:i+1))
+      a(i,j) = b(j,i) + b(j+1,i+1)
+    end do
+  end do
+
+! CHECK: acc.loop
+! The cache is generated inside fir.do_loop (the inner j loop)
+! CHECK: fir.do_loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! Dimension 1: j to j+1, extent = 2
+! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: i to i+1, extent = 2
+! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_single_element()
+! Test cache with single element access: b(i)
+subroutine test_cache_single_element()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(i))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! Single element b(i): lowerbound = i-1, extent = 1
+! CHECK: %[[I_IDX:.*]] = fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_single_elementEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_mixed_bounds()
+! Test cache with mixed constant and variable bounds: b(1:i)
+subroutine test_cache_mixed_bounds()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(1:i))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: arith.constant 1 : index
+! CHECK: arith.constant 1 : index
+! b(1:i): lowerbound = 1-1 = 0, extent = (i-1) - 0 + 1 = i
+! CHECK: fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %{{.*}} : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
+end subroutine



More information about the flang-commits mailing list