[flang-commits] [flang] [flang][acc] Implement cache directive lowering (PR #174897)

via flang-commits flang-commits at lists.llvm.org
Fri Jan 9 11:58:30 PST 2026


https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/174897

>From d2d2c9db2457f4c7584a4e8c616a6f536458a821 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 14:17:15 -0800
Subject: [PATCH 01/21] [flang][ACC] Implement cache directive lowering

The acc.cache operation is currently defined to be associated with a
loop. However, this implementation generates acc.cache as a standalone
data entry operation, similar to acc.private. The acc.cache operation
definition will be updated in a future change to reflect this usage.

Key implementation details:
- Add genCacheBounds() to generate acc.bounds for cache operands,
  handling single elements (arr(i)), full ranges (arr(l:u)), and
  partial ranges with missing bounds (arr(l:) or arr(:u))
- Generate acc.cache with either the acc_cache or the
  acc_cache_readonly data clause, depending on the readonly modifier
- Update the symbol map so subsequent lowering uses the cache result
- Insert cache operations in the enclosing acc.loop body, before its
  terminator (i.e. after loop iterator setup)
---
 flang/lib/Lower/Bridge.cpp  |  12 ++-
 flang/lib/Lower/OpenACC.cpp | 195 +++++++++++++++++++++++++++++++++++-
 2 files changed, 203 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 9224bc2be1028..69bb4a40af41f 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3506,7 +3506,14 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
   void genFIR(const Fortran::parser::OpenACCConstruct &acc) {
     mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint();
-    localSymbols.pushScope();
+
+    // Cache constructs should not push/pop a scope because they need to update
+    // the symbol map for subsequent statements in the same loop body.
+    bool isCacheConstruct =
+        std::holds_alternative<Fortran::parser::OpenACCCacheConstruct>(acc.u);
+
+    if (!isCacheConstruct)
+      localSymbols.pushScope();
     mlir::Value exitCond = genOpenACCConstruct(
         *this, bridge.getSemanticsContext(), getEval(), acc, localSymbols);
 
@@ -3605,7 +3612,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       for (Fortran::lower::pft::Evaluation &e : curEval->getNestedEvaluations())
         genFIR(e);
     }
-    localSymbols.popScope();
+    if (!isCacheConstruct)
+      localSymbols.popScope();
     builder->restoreInsertionPoint(insertPt);
 
     if (accLoop && exitCond) {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 52fee7baf9de1..a189736c56e40 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4837,12 +4837,203 @@ genACC(Fortran::lower::AbstractConverter &converter,
       atomicConstruct.u);
 }
 
+/// Generate acc.bounds for cache directive. Handles:
+/// - Single element: arr(i) or arr(5)
+/// - Full range: arr(lower:upper)
+/// - Missing upper: arr(lower:) - uses array's upper bound
+/// - Missing lower: arr(:upper) - uses array's lower bound
+static void
+genCacheBounds(Fortran::lower::AbstractConverter &converter,
+               Fortran::semantics::SemanticsContext &semanticsContext,
+               Fortran::lower::StatementContext &stmtCtx,
+               const Fortran::parser::AccObject &accObject,
+               std::stringstream &asFortran,
+               llvm::SmallVectorImpl<mlir::Value> &bounds) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::Location loc = converter.getCurrentLocation();
+  mlir::Type idxTy = builder.getIndexType();
+  mlir::Type boundTy = builder.getType<mlir::acc::DataBoundsType>();
+
+  Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
+  Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
+
+  std::optional<Fortran::evaluate::DataRef> dataRef;
+  Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+      [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+  if (designator)
+    dataRef = Fortran::evaluate::ExtractDataRef(*designator);
+
+  if (!dataRef)
+    return;
+
+  auto *arrayRef = std::get_if<Fortran::evaluate::ArrayRef>(&dataRef->u);
+  if (!arrayRef)
+    return;
+
+  const auto &subscripts = arrayRef->subscript();
+  int dimension = 0;
+  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
+  fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(symbol);
+
+  for (const auto &subscript : subscripts) {
+    if (dimension != 0)
+      asFortran << ',';
+
+    mlir::Value lbound, extent;
+    mlir::Value arrayLb =
+        fir::factory::readLowerBound(builder, loc, dataExv, dimension, one);
+    mlir::Value arrayExtent =
+        fir::factory::readExtent(builder, loc, dataExv, dimension);
+
+    const auto *triplet = std::get_if<Fortran::evaluate::Triplet>(&subscript.u);
+
+    if (triplet) {
+      asFortran << ':';
+
+      // Compute lower bound (use array lb if not specified).
+      Fortran::semantics::MaybeExpr lowerSexpr =
+          Fortran::evaluate::AsGenericExpr(triplet->lower());
+      mlir::Value lb;
+      if (lowerSexpr) {
+        auto lowerConst = Fortran::evaluate::ToInt64(*lowerSexpr);
+        if (lowerConst) {
+          lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
+        } else {
+          lb = builder.createConvert(
+              loc, idxTy,
+              fir::getBase(converter.genExprValue(loc, *lowerSexpr, stmtCtx)));
+        }
+      } else {
+        lb = arrayLb;
+      }
+
+      // Compute upper bound (use array ub if not specified).
+      Fortran::semantics::MaybeExpr upperSexpr =
+          Fortran::evaluate::AsGenericExpr(triplet->upper());
+      mlir::Value ub;
+      if (upperSexpr) {
+        auto upperConst = Fortran::evaluate::ToInt64(*upperSexpr);
+        if (upperConst) {
+          ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
+        } else {
+          ub = builder.createConvert(
+              loc, idxTy,
+              fir::getBase(converter.genExprValue(loc, *upperSexpr, stmtCtx)));
+        }
+      } else {
+        // arr(lower:) - upper is array's upper bound
+        ub = mlir::arith::AddIOp::create(
+            builder, loc,
+            mlir::arith::SubIOp::create(builder, loc, arrayLb, one),
+            arrayExtent);
+      }
+
+      // Normalize to zero-based and compute extent.
+      lbound = mlir::arith::SubIOp::create(builder, loc, lb, arrayLb);
+      mlir::Value ubound =
+          mlir::arith::SubIOp::create(builder, loc, ub, arrayLb);
+      extent = mlir::arith::AddIOp::create(
+          builder, loc,
+          mlir::arith::SubIOp::create(builder, loc, ubound, lbound), one);
+    } else {
+      // Single element: arr(elem)
+      using IndirectSubscriptIntegerExpr =
+          Fortran::evaluate::IndirectSubscriptIntegerExpr;
+      using SubscriptInteger = Fortran::evaluate::SubscriptInteger;
+      Fortran::evaluate::Expr<SubscriptInteger> scalarExpr =
+          std::get<IndirectSubscriptIntegerExpr>(subscript.u).value();
+      auto elemConst = Fortran::evaluate::ToInt64(scalarExpr);
+
+      mlir::Value elem;
+      if (elemConst) {
+        elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
+      } else {
+        Fortran::semantics::SomeExpr sexpr =
+            Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
+        elem = builder.createConvert(
+            loc, idxTy,
+            fir::getBase(converter.genExprValue(loc, sexpr, stmtCtx)));
+      }
+
+      lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
+      extent = one;
+    }
+
+    mlir::Value bound = mlir::acc::DataBoundsOp::create(
+        builder, loc, boundTy, lbound, /*upperbound=*/mlir::Value{}, extent,
+        /*stride=*/one, /*strideInBytes=*/false, arrayLb);
+    bounds.push_back(bound);
+    ++dimension;
+  }
+}
+
 static void
 genACC(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semanticsContext,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
-  mlir::Location loc = converter.genLocation(cacheConstruct.source);
-  TODO(loc, "OpenACC cache directive");
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  // Find enclosing acc.loop
+  auto loopOp = builder.getRegion().getParentOfType<mlir::acc::LoopOp>();
+  if (!loopOp)
+    return;
+
+  // Set insertion point before terminator (after loop variable setup)
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::Block &loopBody = loopOp.getRegion().front();
+  builder.setInsertionPoint(loopBody.getTerminator());
+
+  const auto &objectListWithModifier =
+      std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
+  const auto &accObjectList =
+      std::get<Fortran::parser::AccObjectList>(objectListWithModifier.t);
+  const auto &modifier =
+      std::get<std::optional<Fortran::parser::AccDataModifier>>(
+          objectListWithModifier.t);
+  mlir::acc::DataClause dataClause =
+      (modifier &&
+       (*modifier).v == Fortran::parser::AccDataModifier::Modifier::ReadOnly)
+          ? mlir::acc::DataClause::acc_cache_readonly
+          : mlir::acc::DataClause::acc_cache;
+
+  Fortran::lower::StatementContext stmtCtx;
+
+  for (const auto &accObject : accObjectList.v) {
+    mlir::Location operandLocation = genOperandLocation(converter, accObject);
+    Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
+
+    std::stringstream asFortran;
+    asFortran << symbol.name().ToString();
+
+    fir::factory::AddrAndBoundsInfo info = getDataOperandBaseAddr(
+        converter, builder, symbol, operandLocation, /*unwrapFirBox=*/true);
+    mlir::Value baseAddr = info.addr;
+
+    llvm::SmallVector<mlir::Value> bounds;
+    genCacheBounds(converter, semanticsContext, stmtCtx, accObject, asFortran,
+                   bounds);
+
+    mlir::acc::CacheOp cacheOp = createDataEntryOp<mlir::acc::CacheOp>(
+        builder, operandLocation, baseAddr, asFortran, bounds,
+        /*structured=*/false, /*implicit=*/false, dataClause,
+        baseAddr.getType(),
+        /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
+        /*unwrapBoxAddr=*/true, /*isPresent=*/mlir::Value{});
+
+    // Update symbol map so future lowering uses the cache result
+    Fortran::lower::SymMap &symbolMap = converter.getSymbolMap();
+    if (auto hostDef = symbolMap.lookupVariableDefinition(symbol)) {
+      // Clone the host declare with cache result as input
+      // The first operand is the memref/base for both hlfir::DeclareOp and
+      // fir::DeclareOp
+      mlir::Operation *hostDefOp = (*hostDef).getOperation();
+      mlir::IRMapping mapper;
+      mapper.map(hostDefOp->getOperand(0), cacheOp.getAccVar());
+      mlir::Operation *newDef = builder.clone(*hostDefOp, mapper);
+      symbolMap.addVariableDefinition(
+          symbol, llvm::cast<fir::FortranVariableOpInterface>(newDef));
+    }
+  }
 }
 
 mlir::Value Fortran::lower::genOpenACCConstruct(

>From b0f9bf81b4abeef3508149f82df5b7aa10edc5dd Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 16:54:34 -0800
Subject: [PATCH 02/21] Update the acc-cache test

---
 flang/lib/Lower/OpenACC.cpp                 |  14 +--
 flang/test/Lower/OpenACC/Todo/acc-cache.f90 |  15 ---
 flang/test/Lower/OpenACC/acc-cache.f90      | 113 ++++++++++++++++++++
 3 files changed, 116 insertions(+), 26 deletions(-)
 delete mode 100644 flang/test/Lower/OpenACC/Todo/acc-cache.f90
 create mode 100644 flang/test/Lower/OpenACC/acc-cache.f90

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a189736c56e40..101e699985532 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4899,9 +4899,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
         if (lowerConst) {
           lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
         } else {
-          lb = builder.createConvert(
-              loc, idxTy,
-              fir::getBase(converter.genExprValue(loc, *lowerSexpr, stmtCtx)));
+          mlir::emitError(loc, "unsupported OpenACC cache subscript");
         }
       } else {
         lb = arrayLb;
@@ -4916,9 +4914,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
         if (upperConst) {
           ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
         } else {
-          ub = builder.createConvert(
-              loc, idxTy,
-              fir::getBase(converter.genExprValue(loc, *upperSexpr, stmtCtx)));
+          mlir::emitError(loc, "unsupported OpenACC cache subscript");
         }
       } else {
         // arr(lower:) - upper is array's upper bound
@@ -4948,11 +4944,7 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
       if (elemConst) {
         elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
       } else {
-        Fortran::semantics::SomeExpr sexpr =
-            Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
-        elem = builder.createConvert(
-            loc, idxTy,
-            fir::getBase(converter.genExprValue(loc, sexpr, stmtCtx)));
+        mlir::emitError(loc, "unsupported OpenACC cache subscript");
       }
 
       lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
diff --git a/flang/test/Lower/OpenACC/Todo/acc-cache.f90 b/flang/test/Lower/OpenACC/Todo/acc-cache.f90
deleted file mode 100644
index 8b81e876ed2c9..0000000000000
--- a/flang/test/Lower/OpenACC/Todo/acc-cache.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %s -o - 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: OpenACC cache directive
-
-subroutine test_cache()
-  integer, parameter :: n = 10
-  real, dimension(n) :: a, b
-  integer :: i
-
-  !$acc loop
-  do i = 1, n
-    !$acc cache(b)
-    a(i) = b(i)
-  end do
-end subroutine
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
new file mode 100644
index 0000000000000..ce30f52d0c687
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -0,0 +1,113 @@
+! This test checks lowering of OpenACC cache directive.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+
+! CHECK-LABEL: func.func @_QPtest_cache_basic()
+subroutine test_cache_basic()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b)
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_basicEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_readonly()
+subroutine test_cache_readonly()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(readonly: b)
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_cache_readonly>, name = "b"
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_readonlyEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_array_section()
+! For b(2:5): lowerbound = 2-1 = 1, extent = 5-2+1 = 4
+subroutine test_cache_array_section()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(2:5))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C2:.*]] = arith.constant 2 : index
+! CHECK: %[[C5:.*]] = arith.constant 5 : index
+! CHECK: %[[LB:.*]] = arith.subi %[[C2]], %[[C1]] : index
+! CHECK: %[[TMP1:.*]] = arith.subi %[[C5]], %[[C1]] : index
+! CHECK: %[[TMP2:.*]] = arith.subi %[[TMP1]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP2]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_array_sectionEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_multiple()
+subroutine test_cache_multiple()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b, c
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b, c)
+    a(i) = b(i) + c(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[CACHE_B:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
+! CHECK: hlfir.declare %[[CACHE_B]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEb"}
+! CHECK: %[[CACHE_C:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "c"
+! CHECK: hlfir.declare %[[CACHE_C]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEc"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_2d_array()
+! For b(1:5, 1:5): each dimension has lowerbound = 0, extent = 5
+subroutine test_cache_2d_array()
+  integer, parameter :: n = 10
+  real, dimension(n, n) :: a, b
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(1:5, 1:5))
+    do j = 1, n
+      a(i,j) = b(i,j)
+    end do
+  end do
+
+! CHECK: acc.loop
+! Dimension 1: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
+! CHECK: arith.constant 1 : index
+! CHECK: arith.constant 5 : index
+! CHECK: arith.subi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
+! CHECK: arith.constant 5 : index
+! CHECK: arith.subi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
+end subroutine

>From b9350fba050f0cf0085e29078d03f009768a4c15 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 18:12:04 -0800
Subject: [PATCH 03/21] Support iterators in bounds

---
 flang/lib/Lower/OpenACC.cpp            | 54 ++++++++++----------------
 flang/test/Lower/OpenACC/acc-cache.f90 | 24 ++++++++++++
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 101e699985532..a285707ea6e85 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4870,6 +4870,16 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
   if (!arrayRef)
     return;
 
+  // Helper to generate index value from expression.
+  // Optimize for compile-time constants to generate index type directly.
+  auto genIndex =
+      [&](const Fortran::semantics::MaybeExpr &expr) -> mlir::Value {
+    if (auto constVal = Fortran::evaluate::ToInt64(*expr))
+      return builder.createIntegerConstant(loc, idxTy, *constVal);
+    return builder.createConvert(
+        loc, idxTy, fir::getBase(converter.genExprValue(loc, *expr, stmtCtx)));
+  };
+
   const auto &subscripts = arrayRef->subscript();
   int dimension = 0;
   mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
@@ -4891,31 +4901,16 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
       asFortran << ':';
 
       // Compute lower bound (use array lb if not specified).
-      Fortran::semantics::MaybeExpr lowerSexpr =
+      Fortran::semantics::MaybeExpr lowerExpr =
           Fortran::evaluate::AsGenericExpr(triplet->lower());
-      mlir::Value lb;
-      if (lowerSexpr) {
-        auto lowerConst = Fortran::evaluate::ToInt64(*lowerSexpr);
-        if (lowerConst) {
-          lb = builder.createIntegerConstant(loc, idxTy, *lowerConst);
-        } else {
-          mlir::emitError(loc, "unsupported OpenACC cache subscript");
-        }
-      } else {
-        lb = arrayLb;
-      }
+      mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
 
       // Compute upper bound (use array ub if not specified).
-      Fortran::semantics::MaybeExpr upperSexpr =
+      Fortran::semantics::MaybeExpr upperExpr =
           Fortran::evaluate::AsGenericExpr(triplet->upper());
       mlir::Value ub;
-      if (upperSexpr) {
-        auto upperConst = Fortran::evaluate::ToInt64(*upperSexpr);
-        if (upperConst) {
-          ub = builder.createIntegerConstant(loc, idxTy, *upperConst);
-        } else {
-          mlir::emitError(loc, "unsupported OpenACC cache subscript");
-        }
+      if (upperExpr) {
+        ub = genIndex(upperExpr);
       } else {
         // arr(lower:) - upper is array's upper bound
         ub = mlir::arith::AddIOp::create(
@@ -4933,19 +4928,12 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
           mlir::arith::SubIOp::create(builder, loc, ubound, lbound), one);
     } else {
       // Single element: arr(elem)
-      using IndirectSubscriptIntegerExpr =
-          Fortran::evaluate::IndirectSubscriptIntegerExpr;
-      using SubscriptInteger = Fortran::evaluate::SubscriptInteger;
-      Fortran::evaluate::Expr<SubscriptInteger> scalarExpr =
-          std::get<IndirectSubscriptIntegerExpr>(subscript.u).value();
-      auto elemConst = Fortran::evaluate::ToInt64(scalarExpr);
-
-      mlir::Value elem;
-      if (elemConst) {
-        elem = builder.createIntegerConstant(loc, idxTy, *elemConst);
-      } else {
-        mlir::emitError(loc, "unsupported OpenACC cache subscript");
-      }
+      Fortran::evaluate::Expr<Fortran::evaluate::SubscriptInteger> scalarExpr =
+          std::get<Fortran::evaluate::IndirectSubscriptIntegerExpr>(subscript.u)
+              .value();
+      Fortran::semantics::MaybeExpr elemExpr =
+          Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
+      mlir::Value elem = genIndex(elemExpr);
 
       lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
       extent = one;
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index ce30f52d0c687..cdf643c2128b6 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -111,3 +111,27 @@ subroutine test_cache_2d_array()
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_loop_var()
+! Test cache with loop variable dependent bounds: b(i:i+2)
+subroutine test_cache_loop_var()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n-2
+    !$acc cache(b(i:i+2))
+    a(i) = b(i) + b(i+1) + b(i+2)
+  end do
+
+! CHECK: acc.loop
+! CHECK: fir.load
+! CHECK: fir.convert
+! CHECK: fir.load
+! CHECK: arith.addi
+! CHECK: fir.convert
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
+end subroutine

>From 1e1a7443e8e09a31b84a88993504888f355a5a65 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 18:44:15 -0800
Subject: [PATCH 04/21] Fix for nested loops

---
 flang/lib/Lower/OpenACC.cpp            |   7 +-
 flang/test/Lower/OpenACC/acc-cache.f90 | 118 ++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 18 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a285707ea6e85..c8d99f1106249 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4953,16 +4953,11 @@ genACC(Fortran::lower::AbstractConverter &converter,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
 
-  // Find enclosing acc.loop
+  // Verify we're inside an acc.loop region.
   auto loopOp = builder.getRegion().getParentOfType<mlir::acc::LoopOp>();
   if (!loopOp)
     return;
 
-  // Set insertion point before terminator (after loop variable setup)
-  mlir::OpBuilder::InsertionGuard guard(builder);
-  mlir::Block &loopBody = loopOp.getRegion().front();
-  builder.setInsertionPoint(loopBody.getTerminator());
-
   const auto &objectListWithModifier =
       std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
   const auto &accObjectList =
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index cdf643c2128b6..84f8de4ebc714 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -99,15 +99,20 @@ subroutine test_cache_2d_array()
   end do
 
 ! CHECK: acc.loop
-! Dimension 1: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
-! CHECK: arith.constant 1 : index
-! CHECK: arith.constant 5 : index
+! CHECK-DAG: arith.constant 1 : index
+! CHECK-DAG: arith.constant 5 : index
+! Dimension 1: lowerbound = 1-1 = 0, extent = 5-0+1 = 5
+! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: lowerbound = 1-1 = 0, extent = 5-0+1 = 5
+! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %{{.*}} : index
 ! CHECK: arith.subi
-! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! Dimension 2: lowerbound = 1-1 = 0, extent = 5-1+1 = 5
-! CHECK: arith.constant 5 : index
 ! CHECK: arith.subi
-! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: arith.addi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 end subroutine
@@ -126,12 +131,101 @@ subroutine test_cache_loop_var()
   end do
 
 ! CHECK: acc.loop
-! CHECK: fir.load
-! CHECK: fir.convert
-! CHECK: fir.load
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! b(i:i+2): lowerbound = i-1, extent = (i+2)-(i)+1 = 3
+! CHECK: fir.convert %{{.*}} : (i64) -> index
 ! CHECK: arith.addi
-! CHECK: fir.convert
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_2d_loop_vars()
+! Test 2D cache with swapped loop variables inside nested loop: b(j:j+1, i:i+1)
+subroutine test_cache_2d_loop_vars()
+  integer, parameter :: n = 10
+  real, dimension(n, n) :: a, b
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n-1
+    do j = 1, n-1
+      !$acc cache(b(j:j+1, i:i+1))
+      a(i,j) = b(j,i) + b(j+1,i+1)
+    end do
+  end do
+
+! CHECK: acc.loop
+! The cache is generated inside fir.do_loop (the inner j loop)
+! CHECK: fir.do_loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! Dimension 1: j to j+1, extent = 2
+! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2: i to i+1, extent = 2
+! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %[[C1]] : index
+! CHECK: arith.subi
+! CHECK: arith.subi
+! CHECK: arith.addi
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_single_element()
+! Test cache with single element access: b(i)
+subroutine test_cache_single_element()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(i))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! Single element b(i): lowerbound = i-1, extent = 1
+! CHECK: %[[I_IDX:.*]] = fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_single_elementEb"}
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_mixed_bounds()
+! Test cache with mixed constant and variable bounds: b(1:i)
+subroutine test_cache_mixed_bounds()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b(1:i))
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: arith.constant 1 : index
+! CHECK: arith.constant 1 : index
+! b(1:i): lowerbound = 1-1 = 0, extent = (i-1) - 0 + 1 = i
+! CHECK: fir.convert %{{.*}} : (i64) -> index
+! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
+! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %{{.*}} : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
+end subroutine

>From a565d9ae098a621decf5e49ea1ecd0187deeef7a Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 19:50:17 -0800
Subject: [PATCH 05/21] Improve acc-cache.f90 tests for iterator bounds
 verification

---
 flang/test/Lower/OpenACC/acc-cache.f90 | 133 +++++++++++++++++--------
 1 file changed, 92 insertions(+), 41 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 84f8de4ebc714..67af65ee10cfa 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -39,7 +39,7 @@ subroutine test_cache_readonly()
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_array_section()
-! For b(2:5): lowerbound = 2-1 = 1, extent = 5-2+1 = 4
+! For b(2:5) with startIdx=1: lowerbound = 2-1 = 1, upperbound = 5-1 = 4, extent = 4
 subroutine test_cache_array_section()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -84,7 +84,7 @@ subroutine test_cache_multiple()
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_2d_array()
-! For b(1:5, 1:5): each dimension has lowerbound = 0, extent = 5
+! For b(1:5, 1:5) with startIdx=1: each dimension has lowerbound = 1-1 = 0, upperbound = 5-1 = 4, extent = 5
 subroutine test_cache_2d_array()
   integer, parameter :: n = 10
   real, dimension(n, n) :: a, b
@@ -101,13 +101,13 @@ subroutine test_cache_2d_array()
 ! CHECK: acc.loop
 ! CHECK-DAG: arith.constant 1 : index
 ! CHECK-DAG: arith.constant 5 : index
-! Dimension 1: lowerbound = 1-1 = 0, extent = 5-0+1 = 5
+! Dimension 1: lowerbound = 1 - startIdx = 0, upperbound = 5 - startIdx = 4, extent = 5
 ! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %{{.*}} : index
 ! CHECK: arith.subi
 ! CHECK: arith.subi
 ! CHECK: arith.addi
 ! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! Dimension 2: lowerbound = 1-1 = 0, extent = 5-0+1 = 5
+! Dimension 2: lowerbound = 1 - startIdx = 0, upperbound = 5 - startIdx = 4, extent = 5
 ! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %{{.*}} : index
 ! CHECK: arith.subi
 ! CHECK: arith.subi
@@ -130,16 +130,29 @@ subroutine test_cache_loop_var()
     a(i) = b(i) + b(i+1) + b(i+2)
   end do
 
-! CHECK: acc.loop
+! CHECK: acc.loop private({{.*}}) control(%[[IV:.*]] : i32) = ({{.*}}) to ({{.*}})
+! The privatized iterator is declared and initialized from the loop control variable
+! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_loop_varEi"}
+! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
+! Bounds generation loads the iterator and converts it to index
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! b(i:i+2): lowerbound = i-1, extent = (i+2)-(i)+1 = 3
-! CHECK: fir.convert %{{.*}} : (i64) -> index
-! CHECK: arith.addi
-! CHECK: fir.convert %{{.*}} : (i64) -> index
-! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
-! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %[[C1]] : index
-! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %[[C1]] : index
+! Load i for lower bound (i)
+! CHECK: %[[I_LOAD1:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[I_I64_1:.*]] = fir.convert %[[I_LOAD1]] : (i32) -> i64
+! CHECK: %[[I_IDX_1:.*]] = fir.convert %[[I_I64_1]] : (i64) -> index
+! Load i for upper bound (i+2)
+! CHECK: %[[I_LOAD2:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
+! CHECK: %[[I_PLUS_2:.*]] = arith.addi %[[I_LOAD2]], %[[C2_I32]] : i32
+! CHECK: %[[UB_I64:.*]] = fir.convert %[[I_PLUS_2]] : (i32) -> i64
+! CHECK: %[[UB_IDX:.*]] = fir.convert %[[UB_I64]] : (i64) -> index
+! Compute lowerbound = i - startIdx (offset from startIdx)
+! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX_1]], %[[C1]] : index
+! Compute upperbound = (i+2) - startIdx (offset from startIdx)
+! CHECK: %[[UB:.*]] = arith.subi %[[UB_IDX]], %[[C1]] : index
+! Compute extent = ub - lb + 1
+! CHECK: %[[DIFF:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
@@ -160,22 +173,45 @@ subroutine test_cache_2d_loop_vars()
     end do
   end do
 
-! CHECK: acc.loop
-! The cache is generated inside fir.do_loop (the inner j loop)
-! CHECK: fir.do_loop
+! CHECK: acc.loop private({{.*}}) control(%[[I_IV:.*]] : i32) = ({{.*}}) to ({{.*}})
+! Outer loop iterator i is stored to privatized variable
+! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_2d_loop_varsEi"}
+! CHECK: fir.store %[[I_IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
+! Inner loop j (non-acc loop, fir.do_loop)
+! CHECK: fir.do_loop %[[J_IV:.*]] = {{.*}} iter_args(%[[J_ITER:.*]] = {{.*}})
+! Inner loop iterator j is stored to j variable
+! CHECK: fir.store %[[J_ITER]] to %[[J_REF:.*]] : !fir.ref<i32>
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! Dimension 1: j to j+1, extent = 2
-! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %[[C1]] : index
-! CHECK: arith.subi
-! CHECK: arith.subi
-! CHECK: arith.addi
-! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! Dimension 2: i to i+1, extent = 2
-! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %[[C1]] : index
-! CHECK: arith.subi
-! CHECK: arith.subi
-! CHECK: arith.addi
-! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 1 bounds from j: load j, convert, compute j+1
+! CHECK: %[[J_LOAD1:.*]] = fir.load %[[J_REF]] : !fir.ref<i32>
+! CHECK: %[[J_I64_1:.*]] = fir.convert %[[J_LOAD1]] : (i32) -> i64
+! CHECK: %[[J_IDX_1:.*]] = fir.convert %[[J_I64_1]] : (i64) -> index
+! CHECK: %[[J_LOAD2:.*]] = fir.load %[[J_REF]] : !fir.ref<i32>
+! CHECK: %[[C1_I32_J:.*]] = arith.constant 1 : i32
+! CHECK: %[[J_PLUS_1:.*]] = arith.addi %[[J_LOAD2]], %[[C1_I32_J]] : i32
+! CHECK: %[[J_PLUS_1_I64:.*]] = fir.convert %[[J_PLUS_1]] : (i32) -> i64
+! CHECK: %[[J_PLUS_1_IDX:.*]] = fir.convert %[[J_PLUS_1_I64]] : (i64) -> index
+! Compute lowerbound = j - 1, upperbound = (j+1) - 1, extent = 2
+! CHECK: %[[LB1:.*]] = arith.subi %[[J_IDX_1]], %[[C1]] : index
+! CHECK: %[[UB1:.*]] = arith.subi %[[J_PLUS_1_IDX]], %[[C1]] : index
+! CHECK: %[[DIFF1:.*]] = arith.subi %[[UB1]], %[[LB1]] : index
+! CHECK: %[[EXT1:.*]] = arith.addi %[[DIFF1]], %[[C1]] : index
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%[[EXT1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! Dimension 2 bounds from i (outer loop): load i, convert, compute i+1
+! CHECK: %[[I_LOAD1:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[I_I64_1:.*]] = fir.convert %[[I_LOAD1]] : (i32) -> i64
+! CHECK: %[[I_IDX_1:.*]] = fir.convert %[[I_I64_1]] : (i64) -> index
+! CHECK: %[[I_LOAD2:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[C1_I32_I:.*]] = arith.constant 1 : i32
+! CHECK: %[[I_PLUS_1:.*]] = arith.addi %[[I_LOAD2]], %[[C1_I32_I]] : i32
+! CHECK: %[[I_PLUS_1_I64:.*]] = fir.convert %[[I_PLUS_1]] : (i32) -> i64
+! CHECK: %[[I_PLUS_1_IDX:.*]] = fir.convert %[[I_PLUS_1_I64]] : (i64) -> index
+! Compute lowerbound = i - 1, upperbound = (i+1) - 1, extent = 2
+! CHECK: %[[LB2:.*]] = arith.subi %[[I_IDX_1]], %[[C1]] : index
+! CHECK: %[[UB2:.*]] = arith.subi %[[I_PLUS_1_IDX]], %[[C1]] : index
+! CHECK: %[[DIFF2:.*]] = arith.subi %[[UB2]], %[[LB2]] : index
+! CHECK: %[[EXT2:.*]] = arith.addi %[[DIFF2]], %[[C1]] : index
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%[[EXT2]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
 end subroutine
@@ -193,10 +229,16 @@ subroutine test_cache_single_element()
     a(i) = b(i)
   end do
 
-! CHECK: acc.loop
+! CHECK: acc.loop private({{.*}}) control(%[[IV:.*]] : i32) = ({{.*}}) to ({{.*}})
+! The privatized iterator is declared and initialized from the loop control variable
+! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_single_elementEi"}
+! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! Single element b(i): lowerbound = i-1, extent = 1
-! CHECK: %[[I_IDX:.*]] = fir.convert %{{.*}} : (i64) -> index
+! Load i from the iterator variable and convert to index
+! CHECK: %[[I_LOAD:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[I_I64:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
+! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_I64]] : (i64) -> index
+! Compute lowerbound = i - startIdx (offset from startIdx), extent = 1 for single element
 ! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
@@ -216,16 +258,25 @@ subroutine test_cache_mixed_bounds()
     a(i) = b(i)
   end do
 
-! CHECK: acc.loop
-! CHECK: arith.constant 1 : index
-! CHECK: arith.constant 1 : index
-! b(1:i): lowerbound = 1-1 = 0, extent = (i-1) - 0 + 1 = i
-! CHECK: fir.convert %{{.*}} : (i64) -> index
-! CHECK: %[[LB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
-! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %{{.*}} : index
-! CHECK: %[[TMP:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT:.*]] = arith.addi %[[TMP]], %{{.*}} : index
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! CHECK: acc.loop private({{.*}}) control(%[[IV:.*]] : i32) = ({{.*}}) to ({{.*}})
+! The privatized iterator is declared and initialized
+! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_mixed_boundsEi"}
+! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! b(1:i): lower bound is constant 1
+! CHECK: %[[C1_LB:.*]] = arith.constant 1 : index
+! Upper bound i is loaded from iterator variable
+! CHECK: %[[I_LOAD:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[I_I64:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
+! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_I64]] : (i64) -> index
+! Compute lowerbound = 1 - startIdx = 0 (constant offset)
+! CHECK: %[[LB:.*]] = arith.subi %[[C1_LB]], %[[C1]] : index
+! Compute upperbound = i - startIdx (offset from startIdx, uses iterator)
+! CHECK: %[[UB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
+! Compute extent = ub - lb + 1 = i (dynamic based on iterator)
+! CHECK: %[[DIFF:.*]] = arith.subi %[[UB]], %[[LB]] : index
+! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
 end subroutine

>From 6359cb4238ef07406679fcad21d328ae21a4c57a Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 20:19:25 -0800
Subject: [PATCH 06/21] Detect invalid bounds

---
 flang/lib/Lower/OpenACC.cpp | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index c8d99f1106249..f88b42b530ffd 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4900,14 +4900,30 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
     if (triplet) {
       asFortran << ':';
 
-      // Compute lower bound (use array lb if not specified).
+      // OpenACC spec requires at least one of lower or upper bound to be
+      // specified: arr(lower:upper), arr(lower:), or arr(:upper).
+      // arr(:) with both bounds missing is not allowed.
       Fortran::semantics::MaybeExpr lowerExpr =
           Fortran::evaluate::AsGenericExpr(triplet->lower());
+      Fortran::semantics::MaybeExpr upperExpr =
+          Fortran::evaluate::AsGenericExpr(triplet->upper());
+
+      if (!lowerExpr && !upperExpr) {
+        mlir::emitError(loc, "OpenACC cache directive requires at least one "
+                             "bound to be specified for array section");
+      }
+
+      // OpenACC cache does not support strided array sections.
+      if (auto strideVal = Fortran::evaluate::ToInt64(triplet->stride())) {
+        if (*strideVal != 1)
+          mlir::emitError(loc, "OpenACC cache directive does not support "
+                               "strided array sections");
+      }
+
+      // Compute lower bound (use array lb if not specified).
       mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
 
       // Compute upper bound (use array ub if not specified).
-      Fortran::semantics::MaybeExpr upperExpr =
-          Fortran::evaluate::AsGenericExpr(triplet->upper());
       mlir::Value ub;
       if (upperExpr) {
         ub = genIndex(upperExpr);

>From bf9a4535a17099af88e51fa6ce15015e42bc9a5e Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 7 Jan 2026 20:37:45 -0800
Subject: [PATCH 07/21] Use report_fatal_error

---
 flang/lib/Lower/OpenACC.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index f88b42b530ffd..472c844130d30 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4909,15 +4909,15 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
           Fortran::evaluate::AsGenericExpr(triplet->upper());
 
       if (!lowerExpr && !upperExpr) {
-        mlir::emitError(loc, "OpenACC cache directive requires at least one "
-                             "bound to be specified for array section");
+        llvm::report_fatal_error("OpenACC cache directive requires at least "
+                                 "one bound to be specified for array section");
       }
 
-      // OpenACC cache does not support strided array sections.
-      if (auto strideVal = Fortran::evaluate::ToInt64(triplet->stride())) {
-        if (*strideVal != 1)
-          mlir::emitError(loc, "OpenACC cache directive does not support "
-                               "strided array sections");
+      // OpenACC cache only supports unit stride (default or explicit 1).
+      auto strideVal = Fortran::evaluate::ToInt64(triplet->stride());
+      if (!strideVal || *strideVal != 1) {
+        llvm::report_fatal_error("OpenACC cache directive does not support "
+                                 "strided array sections");
       }
 
       // Compute lower bound (use array lb if not specified).

>From 92726c0e49c4c6fd9c0128bbda13771ab8a685d9 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 00:40:02 -0800
Subject: [PATCH 08/21] Support non-1 lower bounds; disregard fir::DeclareOp

---
 flang/lib/Lower/OpenACC.cpp            | 34 ++++++++++++++++----------
 flang/test/Lower/OpenACC/acc-cache.f90 | 27 ++++++++++++++++++++
 2 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 472c844130d30..b307a0f2ce932 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4998,7 +4998,14 @@ genACC(Fortran::lower::AbstractConverter &converter,
 
     fir::factory::AddrAndBoundsInfo info = getDataOperandBaseAddr(
         converter, builder, symbol, operandLocation, /*unwrapFirBox=*/true);
-    mlir::Value baseAddr = info.addr;
+
+    // For arrays with non-1 lower bounds, info.addr is a box type.
+    // Use rawInput (the underlying ref) when addr and rawInput have different
+    // element types, similar to how other data clauses handle this case.
+    bool useRawInput =
+        info.rawInput && fir::unwrapRefType(info.addr.getType()) !=
+                             fir::unwrapRefType(info.rawInput.getType());
+    mlir::Value baseAddr = useRawInput ? info.rawInput : info.addr;
 
     llvm::SmallVector<mlir::Value> bounds;
     genCacheBounds(converter, semanticsContext, stmtCtx, accObject, asFortran,
@@ -5011,19 +5018,20 @@ genACC(Fortran::lower::AbstractConverter &converter,
         /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
         /*unwrapBoxAddr=*/true, /*isPresent=*/mlir::Value{});
 
-    // Update symbol map so future lowering uses the cache result
+    // Update symbol map so future lowering uses the cache result.
     Fortran::lower::SymMap &symbolMap = converter.getSymbolMap();
-    if (auto hostDef = symbolMap.lookupVariableDefinition(symbol)) {
-      // Clone the host declare with cache result as input
-      // The first operand is the memref/base for both hlfir::DeclareOp and
-      // fir::DeclareOp
-      mlir::Operation *hostDefOp = (*hostDef).getOperation();
-      mlir::IRMapping mapper;
-      mapper.map(hostDefOp->getOperand(0), cacheOp.getAccVar());
-      mlir::Operation *newDef = builder.clone(*hostDefOp, mapper);
-      symbolMap.addVariableDefinition(
-          symbol, llvm::cast<fir::FortranVariableOpInterface>(newDef));
-    }
+    std::optional<fir::FortranVariableOpInterface> hostDef =
+        symbolMap.lookupVariableDefinition(symbol);
+    assert(hostDef.has_value() && llvm::isa<hlfir::DeclareOp>(*hostDef) &&
+           "expected symbol to be mapped to hlfir.declare");
+    auto hostDeclare = llvm::cast<hlfir::DeclareOp>(*hostDef);
+    // Clone the host declare with cache result as input.
+    mlir::IRMapping mapper;
+    mapper.map(hostDeclare.getMemref(), cacheOp.getAccVar());
+    mlir::Operation *newDef =
+        builder.clone(*hostDeclare.getOperation(), mapper);
+    symbolMap.addVariableDefinition(
+        symbol, llvm::cast<fir::FortranVariableOpInterface>(newDef));
   }
 }
 
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 67af65ee10cfa..10a36f22d1b14 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -280,3 +280,30 @@ subroutine test_cache_mixed_bounds()
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_nonunit_lb()
+! Test cache with array that has non-1 lower bound: arr(10:20), cache(arr(15))
+subroutine test_cache_nonunit_lb()
+  integer :: arr(10:20)
+  integer :: i
+
+  !$acc loop
+  do i = 10, 20
+    !$acc cache(arr(15))
+    arr(i) = i
+  end do
+
+! For arr(10:20), startIdx = 10, element 15 has lowerbound = 15 - 10 = 5
+! CHECK: %[[C10:.*]] = arith.constant 10 : index
+! CHECK: acc.loop
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C15:.*]] = arith.constant 15 : index
+! Compute lowerbound = 15 - startIdx = 15 - 10 = 5
+! CHECK: %[[LB:.*]] = arith.subi %[[C15]], %[[C10]] : index
+! Single element has extent = 1
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C10]] : index)
+! The varPtr uses the ref type (second result of hlfir.declare with shapeshift)
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<11xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<11xi32>> {{{.*}}name = "arr
+! The cloned declare produces a box and ref pair
+! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_nonunit_lbEarr"} : (!fir.ref<!fir.array<11xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<11xi32>>, !fir.ref<!fir.array<11xi32>>)
+end subroutine

>From cc2dd831bf01acf44610a138023fa5048468aa4b Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 09:21:28 -0800
Subject: [PATCH 09/21] Bypass the hlfir.declare generation after acc.cache

---
 flang/lib/Lower/OpenACC.cpp            | 50 ++++++++------------------
 flang/test/Lower/OpenACC/acc-cache.f90 | 18 +++-------
 2 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index b307a0f2ce932..52cd1c34fce6c 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4967,13 +4967,6 @@ static void
 genACC(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semanticsContext,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-
-  // Verify we're inside an acc.loop region.
-  auto loopOp = builder.getRegion().getParentOfType<mlir::acc::LoopOp>();
-  if (!loopOp)
-    return;
-
   const auto &objectListWithModifier =
       std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
   const auto &accObjectList =
@@ -4996,42 +4989,27 @@ genACC(Fortran::lower::AbstractConverter &converter,
     std::stringstream asFortran;
     asFortran << symbol.name().ToString();
 
-    fir::factory::AddrAndBoundsInfo info = getDataOperandBaseAddr(
-        converter, builder, symbol, operandLocation, /*unwrapFirBox=*/true);
-
-    // For arrays with non-1 lower bounds, info.addr is a box type.
-    // Use rawInput (the underlying ref) when addr and rawInput have different
-    // element types, similar to how other data clauses handle this case.
-    bool useRawInput =
-        info.rawInput && fir::unwrapRefType(info.addr.getType()) !=
-                             fir::unwrapRefType(info.rawInput.getType());
-    mlir::Value baseAddr = useRawInput ? info.rawInput : info.addr;
-
     llvm::SmallVector<mlir::Value> bounds;
     genCacheBounds(converter, semanticsContext, stmtCtx, accObject, asFortran,
                    bounds);
 
+    std::optional<fir::FortranVariableOpInterface> varDef =
+        converter.getSymbolMap().lookupVariableDefinition(symbol);
+    assert(varDef.has_value() && llvm::isa<hlfir::DeclareOp>(*varDef) &&
+           "expected symbol to be mapped to hlfir.declare");
+    mlir::Value base = varDef->getBase();
+
+    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
     mlir::acc::CacheOp cacheOp = createDataEntryOp<mlir::acc::CacheOp>(
-        builder, operandLocation, baseAddr, asFortran, bounds,
-        /*structured=*/false, /*implicit=*/false, dataClause,
-        baseAddr.getType(),
+        builder, operandLocation, base, asFortran, bounds,
+        /*structured=*/false, /*implicit=*/false, dataClause, base.getType(),
         /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
-        /*unwrapBoxAddr=*/true, /*isPresent=*/mlir::Value{});
+        /*unwrapBoxAddr=*/false, /*isPresent=*/mlir::Value{});
 
-    // Update symbol map so future lowering uses the cache result.
-    Fortran::lower::SymMap &symbolMap = converter.getSymbolMap();
-    std::optional<fir::FortranVariableOpInterface> hostDef =
-        symbolMap.lookupVariableDefinition(symbol);
-    assert(hostDef.has_value() && llvm::isa<hlfir::DeclareOp>(*hostDef) &&
-           "expected symbol to be mapped to hlfir.declare");
-    auto hostDeclare = llvm::cast<hlfir::DeclareOp>(*hostDef);
-    // Clone the host declare with cache result as input.
-    mlir::IRMapping mapper;
-    mapper.map(hostDeclare.getMemref(), cacheOp.getAccVar());
-    mlir::Operation *newDef =
-        builder.clone(*hostDeclare.getOperation(), mapper);
-    symbolMap.addVariableDefinition(
-        symbol, llvm::cast<fir::FortranVariableOpInterface>(newDef));
+    // Use acc.cache directly as the variable definition.
+    converter.getSymbolMap().addVariableDefinition(
+        symbol, mlir::cast<fir::FortranVariableOpInterface>(
+                    cacheOp.getOperation()));
   }
 }
 
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 10a36f22d1b14..eecc990ae150a 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -18,7 +18,7 @@ subroutine test_cache_basic()
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_basicEb"}
+! CHECK: hlfir.designate %[[CACHE]]
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_readonly()
@@ -35,7 +35,7 @@ subroutine test_cache_readonly()
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_cache_readonly>, name = "b"
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_readonlyEb"}
+! CHECK: hlfir.designate %[[CACHE]]
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_array_section()
@@ -61,7 +61,6 @@ subroutine test_cache_array_section()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[TMP2]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_array_sectionEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_multiple()
@@ -78,9 +77,7 @@ subroutine test_cache_multiple()
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE_B:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
-! CHECK: hlfir.declare %[[CACHE_B]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEb"}
 ! CHECK: %[[CACHE_C:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "c"
-! CHECK: hlfir.declare %[[CACHE_C]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEc"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_2d_array()
@@ -114,7 +111,6 @@ subroutine test_cache_2d_array()
 ! CHECK: arith.addi
 ! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_loop_var()
@@ -155,7 +151,6 @@ subroutine test_cache_loop_var()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_2d_loop_vars()
@@ -213,7 +208,6 @@ subroutine test_cache_2d_loop_vars()
 ! CHECK: %[[EXT2:.*]] = arith.addi %[[DIFF2]], %[[C1]] : index
 ! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%[[EXT2]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_single_element()
@@ -242,7 +236,6 @@ subroutine test_cache_single_element()
 ! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_single_elementEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_mixed_bounds()
@@ -278,7 +271,6 @@ subroutine test_cache_mixed_bounds()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_nonunit_lb()
@@ -302,8 +294,6 @@ subroutine test_cache_nonunit_lb()
 ! CHECK: %[[LB:.*]] = arith.subi %[[C15]], %[[C10]] : index
 ! Single element has extent = 1
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C10]] : index)
-! The varPtr uses the ref type (second result of hlfir.declare with shapeshift)
-! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<11xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<11xi32>> {{{.*}}name = "arr
-! The cloned declare produces a box and ref pair
-! CHECK: hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_nonunit_lbEarr"} : (!fir.ref<!fir.array<11xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<11xi32>>, !fir.ref<!fir.array<11xi32>>)
+! For non-unit lower bound arrays, acc.cache uses the box type from hlfir.declare
+! CHECK: %[[CACHE:.*]] = acc.cache var(%{{.*}} : !fir.box<!fir.array<11xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<11xi32>> {{{.*}}name = "arr
 end subroutine

>From c2212eb47bd490ec0ed5c0c387e3a93f6d46237b Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 09:25:25 -0800
Subject: [PATCH 10/21] format

---
 flang/lib/Lower/OpenACC.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 52cd1c34fce6c..09bea1f353a27 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -5008,8 +5008,8 @@ genACC(Fortran::lower::AbstractConverter &converter,
 
     // Use acc.cache directly as the variable definition.
     converter.getSymbolMap().addVariableDefinition(
-        symbol, mlir::cast<fir::FortranVariableOpInterface>(
-                    cacheOp.getOperation()));
+        symbol,
+        mlir::cast<fir::FortranVariableOpInterface>(cacheOp.getOperation()));
   }
 }
 

>From 2f0759c42b6a6ca20be39b1beeb836d2d3960f95 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 10:52:52 -0800
Subject: [PATCH 11/21] Use DataClauseModifier instead of
 DataClause::acc_cache_readonly

---
 flang/lib/Lower/OpenACC.cpp            | 20 ++++++++++++--------
 flang/test/Lower/OpenACC/acc-cache.f90 |  2 +-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 09bea1f353a27..a48537b644364 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -126,7 +126,9 @@ createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
                   mlir::Type retTy, llvm::ArrayRef<mlir::Value> async,
                   llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
                   llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
-                  bool unwrapBoxAddr = false, mlir::Value isPresent = {}) {
+                  bool unwrapBoxAddr = false, mlir::Value isPresent = {},
+                  mlir::acc::DataClauseModifier modifiers =
+                      mlir::acc::DataClauseModifier::none) {
   mlir::Value varPtrPtr;
   llvm::SmallVector<mlir::Value, 8> operands;
   llvm::SmallVector<int32_t, 8> operandSegments;
@@ -156,6 +158,7 @@ createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
     op.setAsyncOperandsDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes));
   if (!asyncOnlyDeviceTypes.empty())
     op.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes));
+  op.setModifiers(modifiers);
   return op;
 }
 
@@ -4974,11 +4977,9 @@ genACC(Fortran::lower::AbstractConverter &converter,
   const auto &modifier =
       std::get<std::optional<Fortran::parser::AccDataModifier>>(
           objectListWithModifier.t);
-  mlir::acc::DataClause dataClause =
-      (modifier &&
-       (*modifier).v == Fortran::parser::AccDataModifier::Modifier::ReadOnly)
-          ? mlir::acc::DataClause::acc_cache_readonly
-          : mlir::acc::DataClause::acc_cache;
+  bool isReadonly =
+      modifier &&
+      (*modifier).v == Fortran::parser::AccDataModifier::Modifier::ReadOnly;
 
   Fortran::lower::StatementContext stmtCtx;
 
@@ -5002,9 +5003,12 @@ genACC(Fortran::lower::AbstractConverter &converter,
     fir::FirOpBuilder &builder = converter.getFirOpBuilder();
     mlir::acc::CacheOp cacheOp = createDataEntryOp<mlir::acc::CacheOp>(
         builder, operandLocation, base, asFortran, bounds,
-        /*structured=*/false, /*implicit=*/false, dataClause, base.getType(),
+        /*structured=*/false, /*implicit=*/false,
+        mlir::acc::DataClause::acc_cache, base.getType(),
         /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
-        /*unwrapBoxAddr=*/false, /*isPresent=*/mlir::Value{});
+        /*unwrapBoxAddr=*/false, /*isPresent=*/mlir::Value{},
+        isReadonly ? mlir::acc::DataClauseModifier::readonly
+                   : mlir::acc::DataClauseModifier::none);
 
     // Use acc.cache directly as the variable definition.
     converter.getSymbolMap().addVariableDefinition(
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index eecc990ae150a..0428744cdb999 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -34,7 +34,7 @@ subroutine test_cache_readonly()
   end do
 
 ! CHECK: acc.loop
-! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_cache_readonly>, name = "b"
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {modifiers = #acc<data_clause_modifier readonly>, name = "b"
 ! CHECK: hlfir.designate %[[CACHE]]
 end subroutine
 

>From 2ebf2cbdaf212c186cad14bb9603bd992dc41550 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 12:23:41 -0800
Subject: [PATCH 12/21] Revert the FortranVariableOpInterface setting for
 acc.cache

---
 flang/lib/Lower/OpenACC.cpp            | 7 +++----
 flang/test/Lower/OpenACC/acc-cache.f90 | 6 ++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a48537b644364..5b4512a60fa9b 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -5010,10 +5010,9 @@ genACC(Fortran::lower::AbstractConverter &converter,
         isReadonly ? mlir::acc::DataClauseModifier::readonly
                    : mlir::acc::DataClauseModifier::none);
 
-    // Use acc.cache directly as the variable definition.
-    converter.getSymbolMap().addVariableDefinition(
-        symbol,
-        mlir::cast<fir::FortranVariableOpInterface>(cacheOp.getOperation()));
+    fir::ExtendedValue hostExv = converter.getSymbolExtendedValue(symbol);
+    fir::ExtendedValue cacheExv = fir::substBase(hostExv, cacheOp.getAccVar());
+    converter.bindSymbol(symbol, cacheExv);
   }
 }
 
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 0428744cdb999..f5072c5291a7f 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -18,7 +18,8 @@ subroutine test_cache_basic()
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
-! CHECK: hlfir.designate %[[CACHE]]
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_basicEb"}
+! CHECK: hlfir.designate %[[DECL]]#0
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_readonly()
@@ -35,7 +36,8 @@ subroutine test_cache_readonly()
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {modifiers = #acc<data_clause_modifier readonly>, name = "b"
-! CHECK: hlfir.designate %[[CACHE]]
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_readonlyEb"}
+! CHECK: hlfir.designate %[[DECL]]#0
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_array_section()

>From ba7480053ebde2a4c59b5b4f0d6c174a7da15cfc Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 12:49:34 -0800
Subject: [PATCH 13/21] Remove braces

---
 flang/lib/Lower/OpenACC.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 5b4512a60fa9b..874c411fb7d62 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4911,32 +4911,29 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
       Fortran::semantics::MaybeExpr upperExpr =
           Fortran::evaluate::AsGenericExpr(triplet->upper());
 
-      if (!lowerExpr && !upperExpr) {
+      if (!lowerExpr && !upperExpr)
         llvm::report_fatal_error("OpenACC cache directive requires at least "
                                  "one bound to be specified for array section");
-      }
 
       // OpenACC cache only supports unit stride (default or explicit 1).
       auto strideVal = Fortran::evaluate::ToInt64(triplet->stride());
-      if (!strideVal || *strideVal != 1) {
+      if (!strideVal || *strideVal != 1)
         llvm::report_fatal_error("OpenACC cache directive does not support "
                                  "strided array sections");
-      }
 
       // Compute lower bound (use array lb if not specified).
       mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
 
       // Compute upper bound (use array ub if not specified).
       mlir::Value ub;
-      if (upperExpr) {
+      if (upperExpr)
         ub = genIndex(upperExpr);
-      } else {
+      else
         // arr(lower:) - upper is array's upper bound
         ub = mlir::arith::AddIOp::create(
             builder, loc,
             mlir::arith::SubIOp::create(builder, loc, arrayLb, one),
             arrayExtent);
-      }
 
       // Normalize to zero-based and compute extent.
       lbound = mlir::arith::SubIOp::create(builder, loc, lb, arrayLb);

>From f4c0456d8ed13fd8b15561bec17a2a8bf4554a60 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 13:23:01 -0800
Subject: [PATCH 14/21] Test with unstructured control flow; more checks for
 loop body

---
 flang/test/Lower/OpenACC/acc-cache.f90 | 191 +++++++++++++++++++++++--
 1 file changed, 182 insertions(+), 9 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index f5072c5291a7f..e0063a996e1c7 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -19,7 +19,13 @@ subroutine test_cache_basic()
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_basicEb"}
-! CHECK: hlfir.designate %[[DECL]]#0
+! Loop body uses the cached reference
+! CHECK: %[[ELEM:.*]] = hlfir.designate %[[DECL]]#0 (%{{.*}}) : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
+! CHECK: %[[LOAD:.*]] = fir.load %[[ELEM]] : !fir.ref<f32>
+! CHECK: hlfir.assign %[[LOAD]] to %{{.*}} : f32, !fir.ref<f32>
+! Scope termination: acc.yield marks the end of the cache scope
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_readonly()
@@ -37,11 +43,18 @@ subroutine test_cache_readonly()
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {modifiers = #acc<data_clause_modifier readonly>, name = "b"
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_readonlyEb"}
-! CHECK: hlfir.designate %[[DECL]]#0
+! Loop body uses the cached readonly reference
+! CHECK: %[[ELEM:.*]] = hlfir.designate %[[DECL]]#0 (%{{.*}}) : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
+! CHECK: %[[LOAD:.*]] = fir.load %[[ELEM]] : !fir.ref<f32>
+! CHECK: hlfir.assign %[[LOAD]] to %{{.*}} : f32, !fir.ref<f32>
+! Scope termination
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_array_section()
 ! For b(2:5) with startIdx=1: lowerbound = 2-1 = 1, upperbound = 5-1 = 4, extent = 4
+! This test includes an IF statement to verify cache scope with unstructured control flow
 subroutine test_cache_array_section()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -50,7 +63,9 @@ subroutine test_cache_array_section()
   !$acc loop
   do i = 1, n
     !$acc cache(b(2:5))
-    a(i) = b(i)
+    if (i > 2) then
+      a(i) = b(i)
+    end if
   end do
 
 ! CHECK: acc.loop
@@ -63,9 +78,22 @@ subroutine test_cache_array_section()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[TMP2]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_array_sectionEb"}
+! Unstructured control flow: IF condition generates fir.if
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} : i32
+! CHECK: fir.if %[[CMP]] {
+! Loop body uses the cached array section inside conditional
+! CHECK:   hlfir.designate %[[DECL]]#0
+! CHECK:   fir.load
+! CHECK:   hlfir.assign
+! CHECK: }
+! Scope termination: acc.yield terminates the cache scope for all control flow paths
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_multiple()
+! This test includes IF-ELSE to verify cache scope with multiple control flow paths
 subroutine test_cache_multiple()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b, c
@@ -74,12 +102,40 @@ subroutine test_cache_multiple()
   !$acc loop
   do i = 1, n
     !$acc cache(b, c)
-    a(i) = b(i) + c(i)
+    if (i < 5) then
+      a(i) = b(i) + c(i)
+    else
+      a(i) = b(i) - c(i)
+    end if
   end do
 
 ! CHECK: acc.loop
 ! CHECK: %[[CACHE_B:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b"
+! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[CACHE_B]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEb"}
 ! CHECK: %[[CACHE_C:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "c"
+! CHECK: %[[DECL_C:.*]]:2 = hlfir.declare %[[CACHE_C]](%{{.*}}) {uniq_name = "_QFtest_cache_multipleEc"}
+! Unstructured control flow: IF-ELSE generates fir.if with else region
+! CHECK: %[[CMP:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : i32
+! CHECK: fir.if %[[CMP]] {
+! Then branch: uses both cached references with addition
+! CHECK:   hlfir.designate %[[DECL_B]]#0
+! CHECK:   fir.load
+! CHECK:   hlfir.designate %[[DECL_C]]#0
+! CHECK:   fir.load
+! CHECK:   arith.addf
+! CHECK:   hlfir.assign
+! CHECK: } else {
+! Else branch: uses both cached references with subtraction
+! CHECK:   hlfir.designate %[[DECL_B]]#0
+! CHECK:   fir.load
+! CHECK:   hlfir.designate %[[DECL_C]]#0
+! CHECK:   fir.load
+! CHECK:   arith.subf
+! CHECK:   hlfir.assign
+! CHECK: }
+! Scope termination: both IF and ELSE paths use cache, then converge to yield
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_2d_array()
@@ -113,6 +169,15 @@ subroutine test_cache_2d_array()
 ! CHECK: arith.addi
 ! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
+! Nested loop uses the cached 2D array
+! CHECK: fir.do_loop
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: hlfir.assign
+! Scope termination for acc.loop
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_loop_var()
@@ -153,6 +218,20 @@ subroutine test_cache_loop_var()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
+! Loop body uses the cached reference for b(i), b(i+1), b(i+2)
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: arith.addf
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: arith.addf
+! CHECK: hlfir.assign
+! Scope termination
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_2d_loop_vars()
@@ -210,10 +289,24 @@ subroutine test_cache_2d_loop_vars()
 ! CHECK: %[[EXT2:.*]] = arith.addi %[[DIFF2]], %[[C1]] : index
 ! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%[[EXT2]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
+! Loop body uses the cached 2D reference
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: arith.addf
+! CHECK: hlfir.assign
+! Inner loop continues within the cache scope
+! CHECK: }
+! Scope termination for acc.loop
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_single_element()
 ! Test cache with single element access: b(i)
+! This test includes an EXIT statement to verify cache scope with early loop exit
 subroutine test_cache_single_element()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -223,12 +316,18 @@ subroutine test_cache_single_element()
   do i = 1, n
     !$acc cache(b(i))
     a(i) = b(i)
+    if (a(i) > 100.0) exit
   end do
 
-! CHECK: acc.loop private({{.*}}) control(%[[IV:.*]] : i32) = ({{.*}}) to ({{.*}})
-! The privatized iterator is declared and initialized from the loop control variable
+! Unstructured loop with EXIT: acc.loop becomes unstructured with cf.br/cf.cond_br
+! CHECK: acc.loop private({{.*}}) {
+! The privatized iterator is declared
 ! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_single_elementEi"}
-! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
+! Loop control is done with cf.br/cf.cond_br in unstructured form
+! CHECK: cf.br ^[[HEADER:.*]]
+! CHECK: ^[[HEADER]]:
+! CHECK: cf.cond_br %{{.*}}, ^[[BODY:.*]], ^[[EXIT:.*]]
+! CHECK: ^[[BODY]]:
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
 ! Load i from the iterator variable and convert to index
 ! CHECK: %[[I_LOAD:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
@@ -238,10 +337,29 @@ subroutine test_cache_single_element()
 ! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_single_elementEb"}
+! Loop body uses the cached single element
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: fir.load
+! CHECK: hlfir.assign
+! Unstructured control flow: EXIT generates conditional branch
+! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %{{.*}}, %{{.*}} : f32
+! CHECK: cf.cond_br %[[CMP]], ^[[EXIT_BB:.*]], ^[[CONT_BB:.*]]
+! CHECK: ^[[EXIT_BB]]:
+! Early exit path: branch to acc.yield
+! CHECK: cf.br ^[[YIELD:.*]]
+! CHECK: ^[[CONT_BB]]:
+! Normal path: update iterator and loop back
+! CHECK: cf.br ^[[HEADER]]
+! CHECK: ^[[YIELD]]:
+! Scope termination: acc.yield marks end of cache scope
+! CHECK: acc.yield
+! CHECK-NEXT: } attributes {{{.*}}unstructured}
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_mixed_bounds()
 ! Test cache with mixed constant and variable bounds: b(1:i)
+! This test includes a CYCLE statement to verify cache scope with loop continuation
 subroutine test_cache_mixed_bounds()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -250,6 +368,7 @@ subroutine test_cache_mixed_bounds()
   !$acc loop
   do i = 1, n
     !$acc cache(b(1:i))
+    if (mod(i, 2) == 0) cycle
     a(i) = b(i)
   end do
 
@@ -273,10 +392,26 @@ subroutine test_cache_mixed_bounds()
 ! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
+! Unstructured control flow: CYCLE generates inverted fir.if (body executes when NOT cycling)
+! CHECK: %[[MOD:.*]] = arith.remsi %{{.*}}, %{{.*}} : i32
+! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[MOD]], %{{.*}} : i32
+! CHECK: %[[TRUE:.*]] = arith.constant true
+! CHECK: %[[NOT_CYCLE:.*]] = arith.xori %[[CMP]], %[[TRUE]] : i1
+! CHECK: fir.if %[[NOT_CYCLE]] {
+! Loop body uses the cached reference (only executed when not cycling)
+! CHECK:   hlfir.designate %[[DECL]]#0
+! CHECK:   fir.load
+! CHECK:   hlfir.assign
+! CHECK: }
+! Scope termination: acc.yield after the conditional
+! CHECK: acc.yield
+! CHECK-NEXT: }
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_nonunit_lb()
 ! Test cache with array that has non-1 lower bound: arr(10:20), cache(arr(15))
+! This test includes SELECT CASE for multi-way unstructured control flow
 subroutine test_cache_nonunit_lb()
   integer :: arr(10:20)
   integer :: i
@@ -284,12 +419,24 @@ subroutine test_cache_nonunit_lb()
   !$acc loop
   do i = 10, 20
     !$acc cache(arr(15))
-    arr(i) = i
+    select case (mod(i, 3))
+    case (0)
+      arr(i) = i * 2
+    case (1)
+      arr(i) = i * 3
+    case default
+      arr(i) = i
+    end select
   end do
 
 ! For arr(10:20), startIdx = 10, element 15 has lowerbound = 15 - 10 = 5
 ! CHECK: %[[C10:.*]] = arith.constant 10 : index
-! CHECK: acc.loop
+! Unstructured loop with SELECT CASE: acc.loop becomes unstructured
+! CHECK: acc.loop private({{.*}}) {
+! CHECK: cf.br ^[[HEADER:.*]]
+! CHECK: ^[[HEADER]]:
+! CHECK: cf.cond_br %{{.*}}, ^[[BODY:.*]], ^[[EXIT:.*]]
+! CHECK: ^[[BODY]]:
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
 ! CHECK: %[[C15:.*]] = arith.constant 15 : index
 ! Compute lowerbound = 15 - startIdx = 15 - 10 = 5
@@ -298,4 +445,30 @@ subroutine test_cache_nonunit_lb()
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C10]] : index)
 ! For non-unit lower bound arrays, acc.cache uses the box type from hlfir.declare
 ! CHECK: %[[CACHE:.*]] = acc.cache var(%{{.*}} : !fir.box<!fir.array<11xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<11xi32>> {{{.*}}name = "arr
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_nonunit_lbEarr"}
+! Unstructured control flow: SELECT CASE generates fir.select_case
+! CHECK: %[[MOD:.*]] = arith.remsi %{{.*}}, %{{.*}} : i32
+! CHECK: fir.select_case %[[MOD]] : i32 [#fir.point, %{{.*}}, ^[[CASE0:.*]], #fir.point, %{{.*}}, ^[[CASE1:.*]], unit, ^[[DEFAULT:.*]]]
+! Case 0: i * 2
+! CHECK: ^[[CASE0]]:
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: hlfir.assign
+! CHECK: cf.br ^[[MERGE:.*]]
+! Case 1: i * 3
+! CHECK: ^[[CASE1]]:
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: hlfir.assign
+! CHECK: cf.br ^[[MERGE]]
+! Default case: i
+! CHECK: ^[[DEFAULT]]:
+! CHECK: hlfir.designate %[[DECL]]#0
+! CHECK: hlfir.assign
+! CHECK: cf.br ^[[MERGE]]
+! All SELECT CASE branches converge, then loop back or exit
+! CHECK: ^[[MERGE]]:
+! CHECK: cf.br ^[[HEADER]]
+! CHECK: ^[[EXIT]]:
+! Scope termination: acc.yield marks end of cache scope
+! CHECK: acc.yield
+! CHECK-NEXT: } attributes {{{.*}}unstructured}
 end subroutine

>From fc1b53a73ff5f82fb760cc5bb66a5763f7ea7e36 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 19:39:49 -0800
Subject: [PATCH 15/21] Add more cache semantic checks

---
 flang/lib/Lower/OpenACC.cpp                   | 13 ------
 flang/lib/Semantics/check-acc-structure.cpp   | 45 +++++++++++++++++++
 .../Semantics/OpenACC/acc-cache-validity.f90  | 12 +++++
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 874c411fb7d62..32d97d26e062d 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4903,24 +4903,11 @@ genCacheBounds(Fortran::lower::AbstractConverter &converter,
     if (triplet) {
       asFortran << ':';
 
-      // OpenACC spec requires at least one of lower or upper bound to be
-      // specified: arr(lower:upper), arr(lower:), or arr(:upper).
-      // arr(:) with both bounds missing is not allowed.
       Fortran::semantics::MaybeExpr lowerExpr =
           Fortran::evaluate::AsGenericExpr(triplet->lower());
       Fortran::semantics::MaybeExpr upperExpr =
           Fortran::evaluate::AsGenericExpr(triplet->upper());
 
-      if (!lowerExpr && !upperExpr)
-        llvm::report_fatal_error("OpenACC cache directive requires at least "
-                                 "one bound to be specified for array section");
-
-      // OpenACC cache only supports unit stride (default or explicit 1).
-      auto strideVal = Fortran::evaluate::ToInt64(triplet->stride());
-      if (!strideVal || *strideVal != 1)
-        llvm::report_fatal_error("OpenACC cache directive does not support "
-                                 "strided array sections");
-
       // Compute lower bound (use array lb if not specified).
       mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
 
diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp
index 5e87b834edf7e..80812b6fd619d 100644
--- a/flang/lib/Semantics/check-acc-structure.cpp
+++ b/flang/lib/Semantics/check-acc-structure.cpp
@@ -672,6 +672,51 @@ void AccStructureChecker::Enter(const parser::OpenACCCacheConstruct &x) {
     context_.Say(verbatim.source,
           "The CACHE directive must be inside a loop"_err_en_US);
   }
+
+  // Check cache directive array section constraints
+  const auto &objectListWithModifier =
+      std::get<parser::AccObjectListWithModifier>(x.t);
+  const auto &objectList =
+      std::get<parser::AccObjectList>(objectListWithModifier.t);
+
+  for (const auto &accObject : objectList.v) {
+    common::visit(
+        common::visitors{
+            [&](const parser::Designator &designator) {
+              if (const auto *dataRef =
+                      std::get_if<parser::DataRef>(&designator.u)) {
+                if (const auto *arrayElem =
+                        std::get_if<common::Indirection<parser::ArrayElement>>(
+                            &dataRef->u)) {
+                  for (const auto &subscript : arrayElem->value().subscripts) {
+                    if (const auto *triplet =
+                            std::get_if<parser::SubscriptTriplet>(
+                                &subscript.u)) {
+                      const auto &lower{std::get<0>(triplet->t)};
+                      const auto &upper{std::get<1>(triplet->t)};
+                      const auto &stride{std::get<2>(triplet->t)};
+                      if (!lower && !upper) {
+                        context_.Say(designator.source,
+                            "The CACHE directive requires at least one of the bounds in the array section subscript triplet to be specified"_err_en_US);
+                      }
+                      if (stride) {
+                        if (auto strideVal{GetIntValue(*stride)}) {
+                          if (*strideVal != 1) {
+                            context_.Say(designator.source,
+                                "The CACHE directive does not support strided array sections"_err_en_US);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            },
+            [&](const parser::Name &) {
+              // Common block names are not expected in cache directive
+            }},
+        accObject.u);
+  }
 }
 void AccStructureChecker::Leave(const parser::OpenACCCacheConstruct &x) {
   dirContext_.pop_back();
diff --git a/flang/test/Semantics/OpenACC/acc-cache-validity.f90 b/flang/test/Semantics/OpenACC/acc-cache-validity.f90
index 49f400e763bfb..cef8fc041ae32 100644
--- a/flang/test/Semantics/OpenACC/acc-cache-validity.f90
+++ b/flang/test/Semantics/OpenACC/acc-cache-validity.f90
@@ -38,6 +38,18 @@ program openacc_cache_validity
   !ERROR: Only array element or subarray are allowed in CACHE directive
   !$acc cache(/i/)
 
+  !ERROR: The CACHE directive requires at least one of the bounds in the array section subscript triplet to be specified
+  !$acc cache(a(:))
+
+  !ERROR: The CACHE directive requires at least one of the bounds in the array section subscript triplet to be specified
+  !$acc cache(aa(:,:))
+
+  !ERROR: The CACHE directive does not support strided array sections
+  !$acc cache(a(1:10:2))
+
+  !ERROR: The CACHE directive does not support strided array sections
+  !$acc cache(aa(1:10:2, 1:5))
+
   end do
 
   !ERROR: The CACHE directive must be inside a loop

>From b97af196632794c46620218cc7ef9de19759c4f1 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 21:42:59 -0800
Subject: [PATCH 16/21] Use gatherDataOperandAddrAndBounds

---
 flang/lib/Lower/OpenACC.cpp            | 126 +++--------------------
 flang/test/Lower/OpenACC/acc-cache.f90 | 134 +++++++++----------------
 2 files changed, 57 insertions(+), 203 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 32d97d26e062d..062366f87eb09 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4840,120 +4840,11 @@ genACC(Fortran::lower::AbstractConverter &converter,
       atomicConstruct.u);
 }
 
-/// Generate acc.bounds for cache directive. Handles:
-/// - Single element: arr(i) or arr(5)
-/// - Full range: arr(lower:upper)
-/// - Missing upper: arr(lower:) - uses array's upper bound
-/// - Missing lower: arr(:upper) - uses array's lower bound
-static void
-genCacheBounds(Fortran::lower::AbstractConverter &converter,
-               Fortran::semantics::SemanticsContext &semanticsContext,
-               Fortran::lower::StatementContext &stmtCtx,
-               const Fortran::parser::AccObject &accObject,
-               std::stringstream &asFortran,
-               llvm::SmallVectorImpl<mlir::Value> &bounds) {
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  mlir::Location loc = converter.getCurrentLocation();
-  mlir::Type idxTy = builder.getIndexType();
-  mlir::Type boundTy = builder.getType<mlir::acc::DataBoundsType>();
-
-  Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
-  Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
-
-  std::optional<Fortran::evaluate::DataRef> dataRef;
-  Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
-      [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
-  if (designator)
-    dataRef = Fortran::evaluate::ExtractDataRef(*designator);
-
-  if (!dataRef)
-    return;
-
-  auto *arrayRef = std::get_if<Fortran::evaluate::ArrayRef>(&dataRef->u);
-  if (!arrayRef)
-    return;
-
-  // Helper to generate index value from expression.
-  // Optimize for compile-time constants to generate index type directly.
-  auto genIndex =
-      [&](const Fortran::semantics::MaybeExpr &expr) -> mlir::Value {
-    if (auto constVal = Fortran::evaluate::ToInt64(*expr))
-      return builder.createIntegerConstant(loc, idxTy, *constVal);
-    return builder.createConvert(
-        loc, idxTy, fir::getBase(converter.genExprValue(loc, *expr, stmtCtx)));
-  };
-
-  const auto &subscripts = arrayRef->subscript();
-  int dimension = 0;
-  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
-  fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(symbol);
-
-  for (const auto &subscript : subscripts) {
-    if (dimension != 0)
-      asFortran << ',';
-
-    mlir::Value lbound, extent;
-    mlir::Value arrayLb =
-        fir::factory::readLowerBound(builder, loc, dataExv, dimension, one);
-    mlir::Value arrayExtent =
-        fir::factory::readExtent(builder, loc, dataExv, dimension);
-
-    const auto *triplet = std::get_if<Fortran::evaluate::Triplet>(&subscript.u);
-
-    if (triplet) {
-      asFortran << ':';
-
-      Fortran::semantics::MaybeExpr lowerExpr =
-          Fortran::evaluate::AsGenericExpr(triplet->lower());
-      Fortran::semantics::MaybeExpr upperExpr =
-          Fortran::evaluate::AsGenericExpr(triplet->upper());
-
-      // Compute lower bound (use array lb if not specified).
-      mlir::Value lb = lowerExpr ? genIndex(lowerExpr) : arrayLb;
-
-      // Compute upper bound (use array ub if not specified).
-      mlir::Value ub;
-      if (upperExpr)
-        ub = genIndex(upperExpr);
-      else
-        // arr(lower:) - upper is array's upper bound
-        ub = mlir::arith::AddIOp::create(
-            builder, loc,
-            mlir::arith::SubIOp::create(builder, loc, arrayLb, one),
-            arrayExtent);
-
-      // Normalize to zero-based and compute extent.
-      lbound = mlir::arith::SubIOp::create(builder, loc, lb, arrayLb);
-      mlir::Value ubound =
-          mlir::arith::SubIOp::create(builder, loc, ub, arrayLb);
-      extent = mlir::arith::AddIOp::create(
-          builder, loc,
-          mlir::arith::SubIOp::create(builder, loc, ubound, lbound), one);
-    } else {
-      // Single element: arr(elem)
-      Fortran::evaluate::Expr<Fortran::evaluate::SubscriptInteger> scalarExpr =
-          std::get<Fortran::evaluate::IndirectSubscriptIntegerExpr>(subscript.u)
-              .value();
-      Fortran::semantics::MaybeExpr elemExpr =
-          Fortran::evaluate::AsGenericExpr(std::move(scalarExpr));
-      mlir::Value elem = genIndex(elemExpr);
-
-      lbound = mlir::arith::SubIOp::create(builder, loc, elem, arrayLb);
-      extent = one;
-    }
-
-    mlir::Value bound = mlir::acc::DataBoundsOp::create(
-        builder, loc, boundTy, lbound, /*upperbound=*/mlir::Value{}, extent,
-        /*stride=*/one, /*strideInBytes=*/false, arrayLb);
-    bounds.push_back(bound);
-    ++dimension;
-  }
-}
-
 static void
 genACC(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semanticsContext,
        const Fortran::parser::OpenACCCacheConstruct &cacheConstruct) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   const auto &objectListWithModifier =
       std::get<Fortran::parser::AccObjectListWithModifier>(cacheConstruct.t);
   const auto &accObjectList =
@@ -4972,11 +4863,19 @@ genACC(Fortran::lower::AbstractConverter &converter,
     Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
 
     std::stringstream asFortran;
-    asFortran << symbol.name().ToString();
+
+    Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
+    Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+        [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
 
     llvm::SmallVector<mlir::Value> bounds;
-    genCacheBounds(converter, semanticsContext, stmtCtx, accObject, asFortran,
-                   bounds);
+    Fortran::lower::gatherDataOperandAddrAndBounds<mlir::acc::DataBoundsOp,
+                                                   mlir::acc::DataBoundsType>(
+        converter, builder, semanticsContext, stmtCtx, symbol, designator,
+        operandLocation, asFortran, bounds,
+        /*treatIndexAsSection=*/true, /*unwrapFirBox=*/false,
+        /*genDefaultBounds=*/false, /*strideIncludeLowerExtent=*/false,
+        /*loadAllocatableAndPointerComponent=*/false);
 
     std::optional<fir::FortranVariableOpInterface> varDef =
         converter.getSymbolMap().lookupVariableDefinition(symbol);
@@ -4984,7 +4883,6 @@ genACC(Fortran::lower::AbstractConverter &converter,
            "expected symbol to be mapped to hlfir.declare");
     mlir::Value base = varDef->getBase();
 
-    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
     mlir::acc::CacheOp cacheOp = createDataEntryOp<mlir::acc::CacheOp>(
         builder, operandLocation, base, asFortran, bounds,
         /*structured=*/false, /*implicit=*/false,
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index e0063a996e1c7..d22e12cdf96a8 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -69,14 +69,11 @@ subroutine test_cache_array_section()
   end do
 
 ! CHECK: acc.loop
+! For b(2:5) with startIdx=1: lowerbound = 2-1 = 1, upperbound = 5-1 = 4
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! CHECK: %[[C2:.*]] = arith.constant 2 : index
-! CHECK: %[[C5:.*]] = arith.constant 5 : index
-! CHECK: %[[LB:.*]] = arith.subi %[[C2]], %[[C1]] : index
-! CHECK: %[[TMP1:.*]] = arith.subi %[[C5]], %[[C1]] : index
-! CHECK: %[[TMP2:.*]] = arith.subi %[[TMP1]], %[[LB]] : index
-! CHECK: %[[EXT:.*]] = arith.addi %[[TMP2]], %[[C1]] : index
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[LB:.*]] = arith.constant 1 : index
+! CHECK: %[[UB:.*]] = arith.constant 4 : index
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_array_sectionEb"}
 ! Unstructured control flow: IF condition generates fir.if
@@ -154,20 +151,15 @@ subroutine test_cache_2d_array()
   end do
 
 ! CHECK: acc.loop
-! CHECK-DAG: arith.constant 1 : index
-! CHECK-DAG: arith.constant 5 : index
-! Dimension 1: lowerbound = 1 - startIdx = 0, upperbound = 5 - startIdx = 4, extent = 5
-! CHECK: %[[LB1:.*]] = arith.subi %{{.*}}, %{{.*}} : index
-! CHECK: arith.subi
-! CHECK: arith.subi
-! CHECK: arith.addi
-! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! Dimension 2: lowerbound = 1 - startIdx = 0, upperbound = 5 - startIdx = 4, extent = 5
-! CHECK: %[[LB2:.*]] = arith.subi %{{.*}}, %{{.*}} : index
-! CHECK: arith.subi
-! CHECK: arith.subi
-! CHECK: arith.addi
-! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 1: lowerbound = 1 - startIdx = 0, upperbound = 5 - startIdx = 4
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[C4:.*]] = arith.constant 4 : index
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[C0]] : index) upperbound(%[[C4]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! Dimension 2: lowerbound = 0, upperbound = 4
+! CHECK: %[[C0_2:.*]] = arith.constant 0 : index
+! CHECK: %[[C4_2:.*]] = arith.constant 4 : index
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[C0_2]] : index) upperbound(%[[C4_2]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_arrayEb"}
 ! Nested loop uses the cached 2D array
@@ -197,26 +189,22 @@ subroutine test_cache_loop_var()
 ! The privatized iterator is declared and initialized from the loop control variable
 ! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_loop_varEi"}
 ! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
-! Bounds generation loads the iterator and converts it to index
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! Load i for lower bound (i)
+! Load iterator i for lowerbound computation
 ! CHECK: %[[I_LOAD1:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[I_I64_1:.*]] = fir.convert %[[I_LOAD1]] : (i32) -> i64
-! CHECK: %[[I_IDX_1:.*]] = fir.convert %[[I_I64_1]] : (i64) -> index
-! Load i for upper bound (i+2)
+! CHECK: %[[I_CVT1:.*]] = fir.convert %[[I_LOAD1]] : (i32) -> i64
+! CHECK: %[[I_IDX1:.*]] = fir.convert %[[I_CVT1]] : (i64) -> index
+! Compute lowerbound = i - 1
+! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX1]], %[[C1]] : index
+! Load iterator i again for upperbound computation (i+2)
 ! CHECK: %[[I_LOAD2:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
-! CHECK: %[[I_PLUS_2:.*]] = arith.addi %[[I_LOAD2]], %[[C2_I32]] : i32
-! CHECK: %[[UB_I64:.*]] = fir.convert %[[I_PLUS_2]] : (i32) -> i64
-! CHECK: %[[UB_IDX:.*]] = fir.convert %[[UB_I64]] : (i64) -> index
-! Compute lowerbound = i - startIdx (offset from startIdx)
-! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX_1]], %[[C1]] : index
-! Compute upperbound = (i+2) - startIdx (offset from startIdx)
+! CHECK: %[[C2:.*]] = arith.constant 2 : i32
+! CHECK: %[[I_PLUS_2:.*]] = arith.addi %[[I_LOAD2]], %[[C2]] : i32
+! CHECK: %[[UB_CVT:.*]] = fir.convert %[[I_PLUS_2]] : (i32) -> i64
+! CHECK: %[[UB_IDX:.*]] = fir.convert %[[UB_CVT]] : (i64) -> index
+! Compute upperbound = (i+2) - 1
 ! CHECK: %[[UB:.*]] = arith.subi %[[UB_IDX]], %[[C1]] : index
-! Compute extent = ub - lb + 1
-! CHECK: %[[DIFF:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_loop_varEb"}
 ! Loop body uses the cached reference for b(i), b(i+1), b(i+2)
@@ -257,37 +245,10 @@ subroutine test_cache_2d_loop_vars()
 ! CHECK: fir.do_loop %[[J_IV:.*]] = {{.*}} iter_args(%[[J_ITER:.*]] = {{.*}})
 ! Inner loop iterator j is stored to j variable
 ! CHECK: fir.store %[[J_ITER]] to %[[J_REF:.*]] : !fir.ref<i32>
-! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! Dimension 1 bounds from j: load j, convert, compute j+1
-! CHECK: %[[J_LOAD1:.*]] = fir.load %[[J_REF]] : !fir.ref<i32>
-! CHECK: %[[J_I64_1:.*]] = fir.convert %[[J_LOAD1]] : (i32) -> i64
-! CHECK: %[[J_IDX_1:.*]] = fir.convert %[[J_I64_1]] : (i64) -> index
-! CHECK: %[[J_LOAD2:.*]] = fir.load %[[J_REF]] : !fir.ref<i32>
-! CHECK: %[[C1_I32_J:.*]] = arith.constant 1 : i32
-! CHECK: %[[J_PLUS_1:.*]] = arith.addi %[[J_LOAD2]], %[[C1_I32_J]] : i32
-! CHECK: %[[J_PLUS_1_I64:.*]] = fir.convert %[[J_PLUS_1]] : (i32) -> i64
-! CHECK: %[[J_PLUS_1_IDX:.*]] = fir.convert %[[J_PLUS_1_I64]] : (i64) -> index
-! Compute lowerbound = j - 1, upperbound = (j+1) - 1, extent = 2
-! CHECK: %[[LB1:.*]] = arith.subi %[[J_IDX_1]], %[[C1]] : index
-! CHECK: %[[UB1:.*]] = arith.subi %[[J_PLUS_1_IDX]], %[[C1]] : index
-! CHECK: %[[DIFF1:.*]] = arith.subi %[[UB1]], %[[LB1]] : index
-! CHECK: %[[EXT1:.*]] = arith.addi %[[DIFF1]], %[[C1]] : index
-! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB1]] : index) extent(%[[EXT1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
-! Dimension 2 bounds from i (outer loop): load i, convert, compute i+1
-! CHECK: %[[I_LOAD1:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[I_I64_1:.*]] = fir.convert %[[I_LOAD1]] : (i32) -> i64
-! CHECK: %[[I_IDX_1:.*]] = fir.convert %[[I_I64_1]] : (i64) -> index
-! CHECK: %[[I_LOAD2:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[C1_I32_I:.*]] = arith.constant 1 : i32
-! CHECK: %[[I_PLUS_1:.*]] = arith.addi %[[I_LOAD2]], %[[C1_I32_I]] : i32
-! CHECK: %[[I_PLUS_1_I64:.*]] = fir.convert %[[I_PLUS_1]] : (i32) -> i64
-! CHECK: %[[I_PLUS_1_IDX:.*]] = fir.convert %[[I_PLUS_1_I64]] : (i64) -> index
-! Compute lowerbound = i - 1, upperbound = (i+1) - 1, extent = 2
-! CHECK: %[[LB2:.*]] = arith.subi %[[I_IDX_1]], %[[C1]] : index
-! CHECK: %[[UB2:.*]] = arith.subi %[[I_PLUS_1_IDX]], %[[C1]] : index
-! CHECK: %[[DIFF2:.*]] = arith.subi %[[UB2]], %[[LB2]] : index
-! CHECK: %[[EXT2:.*]] = arith.addi %[[DIFF2]], %[[C1]] : index
-! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB2]] : index) extent(%[[EXT2]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! Dimension 1 bounds from j: lowerbound = j-1, upperbound = j
+! CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
+! Dimension 2 bounds from i: lowerbound = i-1, upperbound = i
+! CHECK: %[[BOUND2:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND1]], %[[BOUND2]]) -> !fir.ref<!fir.array<10x10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_2d_loop_varsEb"}
 ! Loop body uses the cached 2D reference
@@ -329,13 +290,13 @@ subroutine test_cache_single_element()
 ! CHECK: cf.cond_br %{{.*}}, ^[[BODY:.*]], ^[[EXIT:.*]]
 ! CHECK: ^[[BODY]]:
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! Load i from the iterator variable and convert to index
+! Load iterator i for bounds computation
 ! CHECK: %[[I_LOAD:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[I_I64:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
-! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_I64]] : (i64) -> index
-! Compute lowerbound = i - startIdx (offset from startIdx), extent = 1 for single element
+! CHECK: %[[I_CVT1:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
+! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_CVT1]] : (i64) -> index
+! Compute lowerbound = i - 1 (single element: upperbound = lowerbound)
 ! CHECK: %[[LB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_single_elementEb"}
 ! Loop body uses the cached single element
@@ -376,21 +337,16 @@ subroutine test_cache_mixed_bounds()
 ! The privatized iterator is declared and initialized
 ! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_cache_mixed_boundsEi"}
 ! CHECK: fir.store %[[IV]] to %[[I_DECL]]#0 : !fir.ref<i32>
+! b(1:i): zero-based lowerbound is the constant 0 (1 - startIdx); upperbound is i - 1
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! b(1:i): lower bound is constant 1
-! CHECK: %[[C1_LB:.*]] = arith.constant 1 : index
-! Upper bound i is loaded from iterator variable
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! Load iterator i for upperbound computation
 ! CHECK: %[[I_LOAD:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
-! CHECK: %[[I_I64:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
-! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_I64]] : (i64) -> index
-! Compute lowerbound = 1 - startIdx = 0 (constant offset)
-! CHECK: %[[LB:.*]] = arith.subi %[[C1_LB]], %[[C1]] : index
-! Compute upperbound = i - startIdx (offset from startIdx, uses iterator)
+! CHECK: %[[I_CVT:.*]] = fir.convert %[[I_LOAD]] : (i32) -> i64
+! CHECK: %[[I_IDX:.*]] = fir.convert %[[I_CVT]] : (i64) -> index
+! Compute upperbound = i - 1
 ! CHECK: %[[UB:.*]] = arith.subi %[[I_IDX]], %[[C1]] : index
-! Compute extent = ub - lb + 1 = i (dynamic based on iterator)
-! CHECK: %[[DIFF:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C1]] : index
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[EXT]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[C0]] : index) upperbound(%[[UB]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {{{.*}}name = "b
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_mixed_boundsEb"}
 ! Unstructured control flow: CYCLE generates inverted fir.if (body executes when NOT cycling)
@@ -437,12 +393,12 @@ subroutine test_cache_nonunit_lb()
 ! CHECK: ^[[HEADER]]:
 ! CHECK: cf.cond_br %{{.*}}, ^[[BODY:.*]], ^[[EXIT:.*]]
 ! CHECK: ^[[BODY]]:
+! Compute lowerbound = 15 - startIdx = 15 - 10 = 5
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
 ! CHECK: %[[C15:.*]] = arith.constant 15 : index
-! Compute lowerbound = 15 - startIdx = 15 - 10 = 5
-! CHECK: %[[LB:.*]] = arith.subi %[[C15]], %[[C10]] : index
-! Single element has extent = 1
-! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%[[C1]] : index) startIdx(%[[C10]] : index)
+! CHECK: %[[LB:.*]] = arith.subi %[[C15]], %{{.*}} : index
+! Single element: upperbound equals lowerbound, startIdx = 10
+! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[LB]] : index) extent(%[[C1]] : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) {strideInBytes = true}
 ! For non-unit lower bound arrays, acc.cache uses the box type from hlfir.declare
 ! CHECK: %[[CACHE:.*]] = acc.cache var(%{{.*}} : !fir.box<!fir.array<11xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<11xi32>> {{{.*}}name = "arr
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CACHE]](%{{.*}}) {uniq_name = "_QFtest_cache_nonunit_lbEarr"}

>From ce328c04b795f8523e9ac74b22e3c8805830be35 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Thu, 8 Jan 2026 22:17:38 -0800
Subject: [PATCH 17/21] format

---
 flang/lib/Semantics/check-acc-structure.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp
index 80812b6fd619d..519f9b1fcf348 100644
--- a/flang/lib/Semantics/check-acc-structure.cpp
+++ b/flang/lib/Semantics/check-acc-structure.cpp
@@ -669,8 +669,8 @@ void AccStructureChecker::Enter(const parser::OpenACCCacheConstruct &x) {
   PushContextAndClauseSets(verbatim.source, llvm::acc::Directive::ACCD_cache);
   SetContextDirectiveSource(verbatim.source);
   if (loopNestLevel == 0) {
-    context_.Say(verbatim.source,
-          "The CACHE directive must be inside a loop"_err_en_US);
+    context_.Say(
+        verbatim.source, "The CACHE directive must be inside a loop"_err_en_US);
   }
 
   // Check cache directive array section constraints

>From baa35c30cbc5cb3a3e7e8567bc405a195e45dbe5 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 9 Jan 2026 08:31:07 -0800
Subject: [PATCH 18/21] Add two more tests: 1. uses of the acc cache variables
 after the cache scope, 2. test without any acc loop

---
 flang/test/Lower/OpenACC/acc-cache.f90 | 46 ++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index d22e12cdf96a8..5205bad7589c3 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -428,3 +428,49 @@ subroutine test_cache_nonunit_lb()
 ! CHECK: acc.yield
 ! CHECK-NEXT: } attributes {{{.*}}unstructured}
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_use_after_region()
+subroutine test_cache_use_after_region()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    !$acc cache(b)
+    a(i) = b(i)
+  end do
+
+  ! Use b after the cache region - should use original variable
+  a(1) = b(1) + 1.0
+
+! CHECK: acc.loop
+! CHECK: acc.cache varPtr(%[[B_VAR:.*]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.yield
+! CHECK: }
+! After loop: uses original b, not cached version
+! CHECK: %[[B_ORIG:.*]] = hlfir.designate %{{.*}}#0 (%{{.*}})
+! CHECK: fir.load %[[B_ORIG]]
+! CHECK: arith.addf
+! CHECK: hlfir.assign
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_in_regular_loop()
+subroutine test_cache_in_regular_loop()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  integer :: i
+
+  ! Cache in regular DO loop (not acc loop)
+  do i = 1, n
+    !$acc cache(b(i))
+    a(i) = b(i)
+  end do
+
+! CHECK: fir.do_loop
+! CHECK: acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}})
+! CHECK: hlfir.declare
+! CHECK: hlfir.designate
+! CHECK: fir.load
+! CHECK: hlfir.assign
+end subroutine
\ No newline at end of file

>From e0c2cc8fc4dc1a70c2eebc54eb982861601b4de7 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 9 Jan 2026 08:46:05 -0800
Subject: [PATCH 19/21] Update the test to check the use of cache

---
 flang/test/Lower/OpenACC/acc-cache.f90 | 48 ++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 5205bad7589c3..c9dc7ce0f7955 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -430,6 +430,7 @@ subroutine test_cache_nonunit_lb()
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPtest_cache_use_after_region()
+! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_use_after_regionEb"}
 subroutine test_cache_use_after_region()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -445,17 +446,52 @@ subroutine test_cache_use_after_region()
   a(1) = b(1) + 1.0
 
 ! CHECK: acc.loop
-! CHECK: acc.cache varPtr(%[[B_VAR:.*]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.cache varPtr(%[[B_VAR]]#0 : !fir.ref<!fir.array<10xf32>>)
 ! CHECK: acc.yield
 ! CHECK: }
 ! After loop: uses original b, not cached version
-! CHECK: %[[B_ORIG:.*]] = hlfir.designate %{{.*}}#0 (%{{.*}})
+! CHECK: %[[B_ORIG:.*]] = hlfir.designate %[[B_VAR]]#0 (%{{.*}})
 ! CHECK: fir.load %[[B_ORIG]]
 ! CHECK: arith.addf
 ! CHECK: hlfir.assign
 end subroutine
 
+
+! CHECK-LABEL: func.func @_QPtest_cache_nested_scope()
+! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_nested_scopeEb"}
+subroutine test_cache_nested_scope()
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b, c
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n
+    !$acc loop
+    do j = 1, n
+      !$acc cache(b(j))
+      c(j) = b(j)
+    end do
+    ! Use b(i) in outer loop - should use original, not inner cache
+    a(i) = b(i)
+  end do
+
+! Outer acc.loop
+! CHECK: acc.loop
+! Inner acc.loop with cache
+! CHECK: acc.loop
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%[[B_VAR]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}})
+! CHECK: %[[CACHE_DECL:.*]]:2 = hlfir.declare %[[CACHE]]
+! CHECK: hlfir.designate %[[CACHE_DECL]]#0
+! CHECK: acc.yield
+! CHECK: }
+! After inner loop: uses original b, not cached
+! CHECK: hlfir.designate %[[B_VAR]]#0
+! CHECK: acc.yield
+end subroutine
+
+
 ! CHECK-LABEL: func.func @_QPtest_cache_in_regular_loop()
+! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_in_regular_loopEb"}
 subroutine test_cache_in_regular_loop()
   integer, parameter :: n = 10
   real, dimension(n) :: a, b
@@ -468,9 +504,9 @@ subroutine test_cache_in_regular_loop()
   end do
 
 ! CHECK: fir.do_loop
-! CHECK: acc.cache varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}})
-! CHECK: hlfir.declare
-! CHECK: hlfir.designate
+! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%[[B_VAR]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}})
+! CHECK: %[[CACHE_DECL:.*]]:2 = hlfir.declare %[[CACHE]]
+! CHECK: hlfir.designate %[[CACHE_DECL]]#0
 ! CHECK: fir.load
 ! CHECK: hlfir.assign
-end subroutine
\ No newline at end of file
+end subroutine

>From ffc4f86e6191a947a91f97c94716419529b0507f Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 9 Jan 2026 08:54:37 -0800
Subject: [PATCH 20/21] Remove unnecessary newlines

---
 flang/test/Lower/OpenACC/acc-cache.f90 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index c9dc7ce0f7955..3fe668f29fdff 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -456,7 +456,6 @@ subroutine test_cache_use_after_region()
 ! CHECK: hlfir.assign
 end subroutine
 
-
 ! CHECK-LABEL: func.func @_QPtest_cache_nested_scope()
 ! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_nested_scopeEb"}
 subroutine test_cache_nested_scope()
@@ -489,7 +488,6 @@ subroutine test_cache_nested_scope()
 ! CHECK: acc.yield
 end subroutine
 
-
 ! CHECK-LABEL: func.func @_QPtest_cache_in_regular_loop()
 ! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_in_regular_loopEb"}
 subroutine test_cache_in_regular_loop()

>From b3aabdf67fd54400f6a57f05911b10f38372a04d Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Fri, 9 Jan 2026 11:58:06 -0800
Subject: [PATCH 21/21] Add symbol scope management for constructs inside
 acc.loop

---
 flang/lib/Lower/Bridge.cpp             | 11 ++++++
 flang/test/Lower/OpenACC/acc-cache.f90 | 52 ++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 69bb4a40af41f..c1b3913cb15ec 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -6193,6 +6193,17 @@ class FirConverter : public Fortran::lower::AbstractConverter {
                           ? eval.getFirstNestedEvaluation().block
                           : eval.block);
 
+    // Push a symbol scope for each construct nested inside an acc.loop so that
+    // bindings made within it (e.g., by a cache directive) do not outlive it.
+    bool needsAccScope =
+        eval.isConstruct() && Fortran::lower::isInOpenACCLoop(*builder);
+    if (needsAccScope)
+      localSymbols.pushScope();
+    auto popAccScope = llvm::make_scope_exit([&]() {
+      if (needsAccScope)
+        localSymbols.popScope();
+    });
+
     // Generate evaluation specific code. Even nop calls should usually reach
     // here in case they start a new block or require generation of a generic
     // end-of-block branch. An alternative is to add special case code
diff --git a/flang/test/Lower/OpenACC/acc-cache.f90 b/flang/test/Lower/OpenACC/acc-cache.f90
index 3fe668f29fdff..1cfe064993160 100644
--- a/flang/test/Lower/OpenACC/acc-cache.f90
+++ b/flang/test/Lower/OpenACC/acc-cache.f90
@@ -508,3 +508,55 @@ subroutine test_cache_in_regular_loop()
 ! CHECK: fir.load
 ! CHECK: hlfir.assign
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_in_if
+! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_cache_in_ifEb"}
+subroutine test_cache_in_if(a, b, cache)
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+  logical :: cache
+  integer :: i
+
+  !$acc loop
+  do i = 1, n
+    if (cache) then
+      !$acc cache(b)
+    end if
+    a(i) = b(i)
+  end do
+
+! CHECK: acc.loop
+! CHECK: fir.if
+! CHECK: acc.cache varPtr(%[[B_VAR]]#0 : !fir.ref<!fir.array<10xf32>>)
+! CHECK: }
+! After IF: uses original b, not cached version
+! CHECK: hlfir.designate %[[B_VAR]]#0
+! CHECK: acc.yield
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cache_in_nested_do
+! CHECK: %[[B_VAR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_cache_in_nested_doEb"}
+subroutine test_cache_in_nested_do()
+  integer, parameter :: n = 1000, m = 100, l = 100
+  real, dimension(n, m, l) :: a, b
+  integer :: i, j
+
+  !$acc loop
+  do i = 1, n
+    do j = 1, m, 2
+      !$acc cache(b(i,m,j))
+    end do
+
+    do j = 1, m, 2
+      a(i, m, :) = b(i, m, :)
+    end do
+  end do
+
+! CHECK: acc.loop
+! First inner DO loop with cache
+! CHECK: fir.do_loop
+! CHECK: acc.cache varPtr(%[[B_VAR]]#0 : !fir.ref<!fir.array<1000x100x100xf32>>) bounds
+! Second inner DO loop: uses original b, not cached version
+! CHECK: fir.do_loop
+! CHECK: hlfir.designate %[[B_VAR]]#0
+end subroutine



More information about the flang-commits mailing list