[flang-commits] [flang] [Flang] Add opt-in affine loop optimization pipeline (PR #191854)

Wed Apr 15 04:51:33 PDT 2026

https://github.com/shuyadav-dev updated https://github.com/llvm/llvm-project/pull/191854

>From f0e6e642c66aa124deffafdc19872a27578fe55f Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Mon, 13 Apr 2026 22:11:42 +0530
Subject: [PATCH 1/3] [Flang] Add opt-in affine loop optimization pipeline. Add
 a new SimplifyDoLoop canonicalization pass and enhance the existing
 AffinePromotion and AffineDemotion passes to enable MLIR affine loop
 transformations (tiling, fusion, interchange) on Fortran DO loops. The
 pipeline is gated behind --enable-affine-loop-opt.

---
 .../flang/Optimizer/Passes/CommandLineOpts.h  |   2 +
 .../flang/Optimizer/Transforms/Passes.h       |   1 +
 .../flang/Optimizer/Transforms/Passes.td      |  26 +
 .../lib/Optimizer/Passes/CommandLineOpts.cpp  |   6 +
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  51 ++
 .../Optimizer/Transforms/AffineDemotion.cpp   | 237 ++++++-
 .../Optimizer/Transforms/AffinePromotion.cpp  | 359 ++++++----
 flang/lib/Optimizer/Transforms/CMakeLists.txt |   1 +
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 639 ++++++++++++++++++
 flang/test/Fir/affine-demotion.fir            | 174 ++++-
 flang/test/Fir/affine-promotion.fir           | 151 ++++-
 flang/test/Transforms/simplify-do-loop.fir    | 322 +++++++++
 12 files changed, 1758 insertions(+), 211 deletions(-)
 create mode 100644 flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
 create mode 100644 flang/test/Transforms/simplify-do-loop.fir

diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
index 882f02032a3b8..4b48cc2abe165 100644
--- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
+++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
@@ -56,6 +56,8 @@ extern llvm::cl::opt<bool> disableFirAliasTags;
 extern llvm::cl::opt<bool> disableFirAvc;
 extern llvm::cl::opt<bool> disableFirMao;
 extern llvm::cl::opt<bool> enableFirLICM;
+extern llvm::cl::opt<bool> enableAffineLoopOpt;
+extern llvm::cl::opt<unsigned> affineLoopOptTileSize;
 extern llvm::cl::opt<bool> useOldAliasTags;
 
 /// CodeGen Passes
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index adacd3cc0cf51..6e2170a3a23a2 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -45,6 +45,7 @@ enum class LICMNestedHoistingMode {
 #include "flang/Optimizer/Transforms/Passes.h.inc"
 
 std::unique_ptr<mlir::Pass> createAffineDemotionPass();
+std::unique_ptr<mlir::Pass> createSimplifyDoLoopPass();
 std::unique_ptr<mlir::Pass>
 createArrayValueCopyPass(fir::ArrayValueCopyOptions options = {});
 std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 71c9f7b62d2be..f5f88ca4b8574 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -35,6 +35,32 @@ def AbstractResultOpt
   ];
 }
 
+def SimplifyDoLoop : Pass<"simplify-fir-loop", "::mlir::func::FuncOp"> {
+  let summary = "Canonicalize fir.do_loop nests for affine promotion.";
+  let description = [{
+    General-purpose FIR loop canonicalization pass.  Transforms perfectly nested
+    fir.do_loop nests into a canonical form suitable for affine promotion and
+    loop optimizations (tiling, fusion, interchange, etc.).
+
+    Analysis phase (per nest):
+    1. Verifies each iter_arg is a shadow of the loop induction variable.
+    2. Builds an IV map: { loop -> (ivAlloca, lb, ub, step, ivType) }.
+    3. Checks safety: single IV store, no calls, IV doesn't escape.
+
+    Transformation phase:
+    1. Removes iter_args, init/final stores, and IV increment ops.
+    2. Forwards loads of the IV alloca to fir.convert(induction_var).
+    3. Emits final IV value stores after the outermost loop using the
+       Fortran formula: final = lb + ((ub - lb + step) / step) * step.
+  }];
+  let constructor = "::fir::createSimplifyDoLoopPass()";
+  let dependentDialects = [
+    "fir::FIROpsDialect", "mlir::func::FuncDialect",
+    "mlir::arith::ArithDialect"
+  ];
+}
+
+
 def AffineDialectPromotion : Pass<"promote-to-affine", "::mlir::func::FuncOp"> {
   let summary = "Promotes `fir.{do_loop,if}` to `affine.{for,if}`.";
   let description = [{
diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
index d461c1b9757b5..d137eb82054dd 100644
--- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
+++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
@@ -62,6 +62,12 @@ cl::opt<bool> useOldAliasTags(
              "the FIR alias tags pass"),
     cl::init(false), cl::Hidden);
 EnableOption(FirLICM, "fir-licm", "FIR loop invariant code motion");
+EnableOption(AffineLoopOpt, "affine-loop-opt",
+             "affine loop optimizations (tiling, fusion, interchange)");
+cl::opt<unsigned> affineLoopOptTileSize(
+    "affine-loop-opt-tile-size",
+    cl::desc("tile size for affine loop tiling (0 = auto from cache model)"),
+    cl::init(0), cl::Hidden);
 
 /// CodeGen Passes
 DisableOption(CodeGenRewrite, "codegen-rewrite", "rewrite FIR for codegen");
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 73e647a1c3956..0b09ac5c2f37e 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -10,6 +10,9 @@
 /// common to flang and the test tools.
 
 #include "flang/Optimizer/Passes/Pipelines.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Affine/Transforms/Passes.h"
+#include "mlir/Pass/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 
 /// Force setting the no-alias attribute on fuction arguments when possible.
@@ -191,6 +194,54 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
   config.setRegionSimplificationLevel(
       mlir::GreedySimplifyRegionLevel::Disabled);
   pm.addPass(mlir::createCSEPass());
+
+  // Affine loop optimization pipeline (opt-in via --enable-affine-loop-opt).
+  if (enableAffineLoopOpt) {
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, fir::createSimplifyDoLoopPass);
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(mlir::createLoopInvariantCodeMotionPass());
+    pm.addPass(fir::createLoopInvariantCodeMotion());
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(fir::createPromoteToAffinePass());
+
+    // Use remove-dead-values instead of canonicalize between promotion and
+    // demotion to avoid folding fir.convert chains.  Canonicalize can merge
+    // a linearisation convert (ref<NxM> -> ref<N*M>) with the promotion
+    // convert (ref<N*M> -> memref<N*M>) into a single ref<NxM> -> memref<N*M>,
+    // which would cause a rank mismatch in AffineDemotion.
+    pm.addPass(mlir::createRemoveDeadValuesPass());
+    pm.addPass(mlir::createCSEPass());
+
+    if (affineLoopOptTileSize > 0) {
+      mlir::affine::registerAffineLoopTiling();
+      std::string pipeline = "func.func(affine-loop-tile{tile-size=" +
+                             std::to_string(affineLoopOptTileSize) + "})";
+      (void)mlir::parsePassPipeline(pipeline, pm);
+    } else {
+      pm.addNestedPass<mlir::func::FuncOp>(
+          mlir::affine::createLoopTilingPass());
+    }
+
+    pm.addPass(mlir::createRemoveDeadValuesPass());
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(fir::createAffineDemotionPass());
+    pm.addPass(mlir::createLowerAffinePass());
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+  }
+
   fir::addAVC(pm, pc.OptLevel);
   addNestedPassToAllTopLevelOperations<PassConstructor>(
       pm, fir::createCharacterConversion);
diff --git a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
index 430ef62a3a55d..4b5f025f4bf36 100644
--- a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
@@ -22,6 +22,7 @@
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -33,6 +34,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include <algorithm>
 
 namespace fir {
 #define GEN_PASS_DEF_AFFINEDIALECTDEMOTION
@@ -46,6 +48,82 @@ using namespace mlir;
 
 namespace {
 
+/// Check whether the FIR base reference points to an array with
+/// dynamic (runtime-determined) extents, e.g. `!fir.ref<!fir.array<?x?xf32>>`.
+/// `fir.coordinate_of` cannot handle such arrays because it needs
+/// compile-time-known dimensions to linearise the multi-dimensional index.
+static bool baseHasDynamicExtents(mlir::Value base) {
+  mlir::Type ty = base.getType();
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+    ty = refTy.getEleTy();
+  else if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    ty = heapTy.getEleTy();
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
+    return seqTy.hasDynamicExtents();
+  return false;
+}
+
+/// Convert 0-based memref indices (already reversed to column-major order)
+/// to Fortran indices expected by fir.array_coor.
+///
+/// For fir.shape (implicit lb=1):    Fortran_idx = 0based + 1
+/// For fir.shape_shift (explicit lb): Fortran_idx = 0based + lb_k
+static SmallVector<Value>
+toFortranIndices(mlir::Value shape, ArrayRef<Value> zeroBasedIndices,
+                 mlir::Location loc, ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> result;
+
+  if (auto shapeShiftOp = shape.getDefiningOp<fir::ShapeShiftOp>()) {
+    auto pairs = shapeShiftOp.getPairs();
+    for (unsigned k = 0; k < zeroBasedIndices.size(); ++k) {
+      mlir::Value lb = pairs[k * 2]; // lower bound for dimension k
+      result.push_back(
+          arith::AddIOp::create(rewriter, loc, zeroBasedIndices[k], lb));
+    }
+  } else {
+    // fir.shape or anything else — lower bound is 1
+    auto one = arith::ConstantIndexOp::create(rewriter, loc, 1);
+    for (auto idx : zeroBasedIndices)
+      result.push_back(arith::AddIOp::create(rewriter, loc, idx, one));
+  }
+  return result;
+}
+
+/// Walk backwards from `base` to locate the `fir.shape` (or shapeshift)
+/// that carries the runtime dimension sizes.
+///
+/// Handles three cases:
+///   1. Explicit-shape arrays: base is from fir.declare → shape is attached.
+///   2. Local allocatable arrays: base is from fir.allocmem → find the
+///      fir.embox that wraps it and recover the shape from there.
+///   3. Allocatable dummy / module arrays: base is from fir.box_addr →
+///      use the original fir.box directly with fir.array_coor (the box
+///      carries all shape info).  In this case `outBoxBase` is set to the
+///      box value and the returned shape is null.
+static mlir::Value findShapeForBase(mlir::Value base, mlir::Value &outBoxBase) {
+  outBoxBase = mlir::Value{};
+
+  // Case 1: explicit-shape via fir.declare
+  if (auto declareOp = base.getDefiningOp<fir::DeclareOp>())
+    return declareOp.getShape();
+
+  // Case 2: local allocatable — find fir.embox that wraps this heap pointer
+  if (base.getDefiningOp<fir::AllocMemOp>()) {
+    for (auto *user : base.getUsers()) {
+      if (auto embox = mlir::dyn_cast<fir::EmboxOp>(user))
+        return embox.getShape();
+    }
+  }
+
+  // Case 3: allocatable dummy arg / module array — base is from fir.box_addr
+  if (auto boxAddr = base.getDefiningOp<fir::BoxAddrOp>()) {
+    outBoxBase = boxAddr.getVal();
+    return mlir::Value{};
+  }
+
+  return mlir::Value{};
+}
+
 class AffineLoadConversion
     : public OpConversionPattern<mlir::affine::AffineLoadOp> {
 public:
@@ -60,12 +138,69 @@ class AffineLoadConversion
     if (!maybeExpandedMap)
       return failure();
 
-    auto coorOp = fir::CoordinateOp::create(
-        rewriter, op.getLoc(),
-        fir::ReferenceType::get(op.getResult().getType()), adaptor.getMemref(),
-        *maybeExpandedMap);
+    auto expandedIndices = *maybeExpandedMap;
 
-    rewriter.replaceOpWithNewOp<fir::LoadOp>(op, coorOp.getResult());
+    // AffinePromotion reverses dimension order (column-major FIR → row-major
+    // memref) and index order.  Reverse indices back for fir.coordinate_of
+    // which uses Fortran's column-major layout.
+    // ConvertConversion already strips the single fir.convert (FIR -> memref)
+    // that AffinePromotion created, so `base` is the original FIR value.
+    // Do NOT trace through any remaining fir.convert — those belong to the
+    // source IR (e.g. linearisation converts from -O2 whole-array lowering).
+    Value base = adaptor.getMemref();
+
+    auto hasSequenceType = [](mlir::Type ty) -> bool {
+      if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+        return mlir::isa<fir::SequenceType>(refTy.getEleTy());
+      if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+        return mlir::isa<fir::SequenceType>(heapTy.getEleTy());
+      return false;
+    };
+
+    if (!hasSequenceType(base.getType()))
+      return op.emitError(
+          "unsupported memref base: expected !fir.ref<!fir.array<...>> or "
+          "!fir.heap<!fir.array<...>>; fir.box and plain memref bases "
+          "are not yet handled by AffineDemotion");
+
+    std::reverse(expandedIndices.begin(), expandedIndices.end());
+
+    auto resultRefTy = fir::ReferenceType::get(op.getResult().getType());
+
+    if (baseHasDynamicExtents(base)) {
+      mlir::Value boxBase;
+      mlir::Value shape = findShapeForBase(base, boxBase);
+
+      if (shape) {
+        auto fortranIndices =
+            toFortranIndices(shape, expandedIndices, op.getLoc(), rewriter);
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, base, shape,
+            /*slice=*/mlir::Value{}, fortranIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::LoadOp>(op, arrayCoorOp.getResult());
+      } else if (boxBase) {
+        // Case 3: box carries shape — use box directly; lb=1 assumed
+        auto one = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1);
+        SmallVector<Value> oneBasedIndices;
+        for (auto idx : expandedIndices)
+          oneBasedIndices.push_back(
+              arith::AddIOp::create(rewriter, op.getLoc(), idx, one));
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, boxBase,
+            /*shape=*/mlir::Value{},
+            /*slice=*/mlir::Value{}, oneBasedIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::LoadOp>(op, arrayCoorOp.getResult());
+      } else {
+        return op.emitError(
+            "cannot find shape or box for dynamic-extent array");
+      }
+    } else {
+      auto coorOp = fir::CoordinateOp::create(
+          rewriter, op.getLoc(), resultRefTy, base, expandedIndices);
+      rewriter.replaceOpWithNewOp<fir::LoadOp>(op, coorOp.getResult());
+    }
     return success();
   }
 };
@@ -84,12 +219,64 @@ class AffineStoreConversion
     if (!maybeExpandedMap)
       return failure();
 
-    auto coorOp = fir::CoordinateOp::create(
-        rewriter, op.getLoc(),
-        fir::ReferenceType::get(op.getValueToStore().getType()),
-        adaptor.getMemref(), *maybeExpandedMap);
-    rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
-                                              coorOp.getResult());
+    auto expandedIndices = *maybeExpandedMap;
+
+    Value base = adaptor.getMemref();
+
+    auto hasSequenceType = [](mlir::Type ty) -> bool {
+      if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+        return mlir::isa<fir::SequenceType>(refTy.getEleTy());
+      if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+        return mlir::isa<fir::SequenceType>(heapTy.getEleTy());
+      return false;
+    };
+
+    if (!hasSequenceType(base.getType()))
+      return op.emitError(
+          "unsupported memref base: expected !fir.ref<!fir.array<...>> or "
+          "!fir.heap<!fir.array<...>>; fir.box and plain memref bases "
+          "are not yet handled by AffineDemotion");
+
+    std::reverse(expandedIndices.begin(), expandedIndices.end());
+
+    auto resultRefTy = fir::ReferenceType::get(op.getValueToStore().getType());
+
+    if (baseHasDynamicExtents(base)) {
+      mlir::Value boxBase;
+      mlir::Value shape = findShapeForBase(base, boxBase);
+
+      if (shape) {
+        auto fortranIndices =
+            toFortranIndices(shape, expandedIndices, op.getLoc(), rewriter);
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, base, shape,
+            /*slice=*/mlir::Value{}, fortranIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                  arrayCoorOp.getResult());
+      } else if (boxBase) {
+        auto one = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1);
+        SmallVector<Value> oneBasedIndices;
+        for (auto idx : expandedIndices)
+          oneBasedIndices.push_back(
+              arith::AddIOp::create(rewriter, op.getLoc(), idx, one));
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, boxBase,
+            /*shape=*/mlir::Value{},
+            /*slice=*/mlir::Value{}, oneBasedIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                  arrayCoorOp.getResult());
+      } else {
+        return op.emitError(
+            "cannot find shape or box for dynamic-extent array");
+      }
+    } else {
+      auto coorOp = fir::CoordinateOp::create(
+          rewriter, op.getLoc(), resultRefTy, base, expandedIndices);
+      rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                coorOp.getResult());
+    }
     return success();
   }
 };
@@ -101,22 +288,18 @@ class ConvertConversion : public mlir::OpRewritePattern<fir::ConvertOp> {
   matchAndRewrite(fir::ConvertOp op,
                   mlir::PatternRewriter &rewriter) const override {
     if (mlir::isa<mlir::MemRefType>(op.getRes().getType())) {
-      // due to index calculation moving to affine maps we still need to
-      // add converts for sequence types this has a side effect of losing
-      // some information about arrays with known dimensions by creating:
-      // fir.convert %arg0 : (!fir.ref<!fir.array<5xi32>>) ->
-      // !fir.ref<!fir.array<?xi32>>
-      if (auto refTy =
-              mlir::dyn_cast<fir::ReferenceType>(op.getValue().getType()))
-        if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy())) {
-          fir::SequenceType::Shape flatShape = {
-              fir::SequenceType::getUnknownExtent()};
-          auto flatArrTy = fir::SequenceType::get(flatShape, arrTy.getEleTy());
-          auto flatTy = fir::ReferenceType::get(flatArrTy);
-          rewriter.replaceOpWithNewOp<fir::ConvertOp>(op, flatTy,
-                                                      op.getValue());
-          return success();
-        }
+      mlir::Type srcTy = op.getValue().getType();
+      auto getSeqTy = [](mlir::Type t) -> fir::SequenceType {
+        if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(t))
+          return mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy());
+        if (auto heapTy = mlir::dyn_cast<fir::HeapType>(t))
+          return mlir::dyn_cast<fir::SequenceType>(heapTy.getEleTy());
+        return {};
+      };
+      if (getSeqTy(srcTy)) {
+        rewriter.replaceOp(op, op.getValue());
+        return success();
+      }
       rewriter.replaceOp(op, op.getValue());
     }
     return success();
diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
index bdc34186a713b..da9364e62682d 100644
--- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
@@ -63,22 +63,117 @@ struct AffineFunctionAnalysis {
 };
 } // namespace
 
-static bool analyzeCoordinate(mlir::Value coordinate, mlir::Operation *op) {
-  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(coordinate)) {
-    if (isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()))
-      return true;
-    LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: array coordinate is not a "
-                               "loop induction variable (owner not loopOp)\n";
-               op->dump());
+/// Recursively checks whether a value can be expressed as an affine function
+/// of loop induction variables and integer constants.  Walks through
+/// fir.convert (type-cast), arith.addi, arith.subi, and arith.muli (the
+/// latter only when at least one operand is a compile-time constant so the
+/// result stays within MLIR's strict affine expression rules).
+static bool isAffineIndex(mlir::Value val, unsigned depth = 0) {
+  if (depth > 16)
+    return false;
+
+  if (auto conv = val.getDefiningOp<fir::ConvertOp>())
+    return isAffineIndex(conv.getValue(), depth + 1);
+
+  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val))
+    return isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
+           isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp());
+
+  auto *defOp = val.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  if (isa<mlir::arith::ConstantOp>(defOp))
+    return true;
+
+  if (auto add = dyn_cast<mlir::arith::AddIOp>(defOp))
+    return isAffineIndex(add.getLhs(), depth + 1) &&
+           isAffineIndex(add.getRhs(), depth + 1);
+
+  if (auto sub = dyn_cast<mlir::arith::SubIOp>(defOp))
+    return isAffineIndex(sub.getLhs(), depth + 1) &&
+           isAffineIndex(sub.getRhs(), depth + 1);
+
+  if (auto mul = dyn_cast<mlir::arith::MulIOp>(defOp)) {
+    auto *lhsDef = mul.getLhs().getDefiningOp();
+    auto *rhsDef = mul.getRhs().getDefiningOp();
+    if ((lhsDef && isa<mlir::arith::ConstantOp>(lhsDef)) ||
+        (rhsDef && isa<mlir::arith::ConstantOp>(rhsDef)))
+      return isAffineIndex(mul.getLhs(), depth + 1) &&
+             isAffineIndex(mul.getRhs(), depth + 1);
     return false;
   }
-  LLVM_DEBUG(
-      llvm::dbgs() << "AffineLoopAnalysis: array coordinate is not a loop "
-                      "induction variable (not a block argument)\n";
-      op->dump(); coordinate.getDefiningOp()->dump());
+
+  LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: index is not an affine "
+                             "expression of loop IVs\n";
+             defOp->dump());
   return false;
 }
 
+/// Builds an mlir::AffineExpr by recursively walking the FIR/arith expression
+/// tree rooted at a fir.array_coor index value.  Loop induction variables
+/// become affine dimensions; integer constants are folded into the expression.
+struct AffineIndexBuilder {
+  using MaybeExpr = std::optional<mlir::AffineExpr>;
+
+  explicit AffineIndexBuilder(mlir::MLIRContext *ctx) : context(ctx) {}
+
+  MaybeExpr build(mlir::Value val) {
+    if (auto conv = val.getDefiningOp<fir::ConvertOp>())
+      return build(conv.getValue());
+
+    if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val)) {
+      if (isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
+          isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp())) {
+        for (unsigned i = 0; i < dims.size(); ++i)
+          if (dims[i] == val)
+            return mlir::getAffineDimExpr(i, context);
+        unsigned idx = dims.size();
+        dims.push_back(val);
+        return mlir::getAffineDimExpr(idx, context);
+      }
+      return {};
+    }
+
+    auto *defOp = val.getDefiningOp();
+    if (!defOp)
+      return {};
+
+    if (auto op = dyn_cast<mlir::arith::ConstantOp>(defOp))
+      if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(op.getValue()))
+        return mlir::getAffineConstantExpr(intAttr.getInt(), context);
+
+    if (auto op = dyn_cast<mlir::arith::AddIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs + *rhs;
+      return {};
+    }
+
+    if (auto op = dyn_cast<mlir::arith::SubIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs - *rhs;
+      return {};
+    }
+
+    if (auto op = dyn_cast<mlir::arith::MulIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs * *rhs;
+      return {};
+    }
+
+    return {};
+  }
+
+  mlir::MLIRContext *context;
+  llvm::SmallVector<mlir::Value> dims;
+};
+
 namespace {
 struct AffineLoopAnalysis {
   AffineLoopAnalysis() = default;
@@ -134,7 +229,7 @@ struct AffineLoopAnalysis {
       }
       bool canPromote = true;
       for (auto coordinate : acoOp.getIndices())
-        canPromote = canPromote && analyzeCoordinate(coordinate, op);
+        canPromote = canPromote && isAffineIndex(coordinate);
       return canPromote;
     }
     if (auto coOp = memref.getDefiningOp<CoordinateOp>()) {
@@ -322,27 +417,6 @@ AffineFunctionAnalysis::getChildIfAnalysis(fir::IfOp op) const {
   return it->getSecond();
 }
 
-/// AffineMap rewriting fir.array_coor operation to affine apply,
-/// %dim = fir.gendim %lowerBound, %upperBound, %stride
-/// %a = fir.array_coor %arr(%dim) %i
-/// returning affineMap = affine_map<(i)[lb, ub, st] -> (i*st - lb)>
-static mlir::AffineMap createArrayIndexAffineMap(unsigned dimensions,
-                                                 MLIRContext *context) {
-  auto index = mlir::getAffineConstantExpr(0, context);
-  auto accuExtent = mlir::getAffineConstantExpr(1, context);
-  for (unsigned i = 0; i < dimensions; ++i) {
-    mlir::AffineExpr idx = mlir::getAffineDimExpr(i, context),
-                     lowerBound = mlir::getAffineSymbolExpr(i * 3, context),
-                     currentExtent =
-                         mlir::getAffineSymbolExpr(i * 3 + 1, context),
-                     stride = mlir::getAffineSymbolExpr(i * 3 + 2, context),
-                     currentPart = (idx * stride - lowerBound) * accuExtent;
-    index = currentPart + index;
-    accuExtent = accuExtent * currentExtent;
-  }
-  return mlir::AffineMap::get(dimensions, dimensions * 3, index);
-}
-
 static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
   if (auto definition = value.getDefiningOp<mlir::arith::ConstantOp>())
     if (auto stepAttr = mlir::dyn_cast<IntegerAttr>(definition.getValue()))
@@ -350,104 +424,111 @@ static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
   return {};
 }
 
-static mlir::Type coordinateArrayElement(fir::ArrayCoorOp op) {
-  if (auto refType =
-          mlir::dyn_cast_or_null<ReferenceType>(op.getMemref().getType())) {
-    if (auto seqType =
-            mlir::dyn_cast_or_null<SequenceType>(refType.getEleTy())) {
-      return seqType.getEleTy();
-    }
-  }
-  op.emitError(
-      "AffineLoopConversion: array type in coordinate operation not valid\n");
-  return mlir::Type();
-}
-
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeOp shape,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(),
-                                             rewriter.getIndexType(),
-                                             rewriter.getIndexAttr(1));
-  auto extents = shape.getExtents();
-  for (auto i = extents.begin(); i < extents.end(); i++) {
-    indexArgs.push_back(one);
-    indexArgs.push_back(*i);
-    indexArgs.push_back(one);
-  }
-}
+/// Holds the result of creating multi-dimensional affine operations.
+struct MultiDimAffineResult {
+  SmallVector<mlir::Value> indices;
+  fir::ConvertOp arrayConvert;
+};
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeShiftOp shape,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(),
-                                             rewriter.getIndexType(),
-                                             rewriter.getIndexAttr(1));
-  auto extents = shape.getPairs();
-  for (auto i = extents.begin(); i < extents.end();) {
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(one);
-  }
-}
+/// Creates multi-dimensional affine operations preserving array dimensionality.
+/// Instead of linearizing all indices into a single 1D offset, this extracts
+/// the array shape from the FIR SequenceType, creates a matching multi-dim
+/// MemRefType, and adjusts each per-dimension index from Fortran 1-based to
+/// memref 0-based indexing.
+static MultiDimAffineResult
+createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
+  auto acoOp = arrayRef.getDefiningOp<ArrayCoorOp>();
+  auto loc = acoOp.getLoc();
+  auto *context = acoOp.getContext();
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::SliceOp slice,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto extents = slice.getTriples();
-  for (auto i = extents.begin(); i < extents.end();) {
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
+  fir::SequenceType seqType;
+  if (auto refType =
+          mlir::dyn_cast<fir::ReferenceType>(acoOp.getMemref().getType()))
+    seqType = mlir::dyn_cast<fir::SequenceType>(refType.getEleTy());
+  else if (auto heapType =
+               mlir::dyn_cast<fir::HeapType>(acoOp.getMemref().getType()))
+    seqType = mlir::dyn_cast<fir::SequenceType>(heapType.getEleTy());
+
+  // The shape must be reversed because memref uses row-major order whereas
+  // fir.array uses column-major order.
+  SmallVector<int64_t> reversedShape(seqType.getShape().rbegin(),
+                                     seqType.getShape().rend());
+
+  auto newType = mlir::MemRefType::get(reversedShape, seqType.getEleTy());
+  auto arrayConvert =
+      fir::ConvertOp::create(rewriter, loc, newType, acoOp.getMemref());
+
+  SmallVector<mlir::Value> adjustedIndices;
+  auto indices = acoOp.getIndices();
+
+  if (auto shapeOp = acoOp.getShape().getDefiningOp<ShapeOp>()) {
+    for (auto idx : indices) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(idx);
+      assert(expr && "analysis guaranteed index is affine");
+      auto adjustedExpr = *expr - 1;
+      auto map = mlir::AffineMap::get(builder.dims.size(), 0, adjustedExpr);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, builder.dims);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
+  } else if (auto shapeShiftOp =
+                 acoOp.getShape().getDefiningOp<ShapeShiftOp>()) {
+    auto pairs = shapeShiftOp.getPairs();
+    for (unsigned i = 0; i < indices.size(); ++i) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(indices[i]);
+      assert(expr && "analysis guaranteed index is affine");
+      auto adjustedExpr = *expr - mlir::getAffineSymbolExpr(0, context);
+      auto map = mlir::AffineMap::get(builder.dims.size(), 1, adjustedExpr);
+      SmallVector<mlir::Value> operands;
+      operands.append(builder.dims.begin(), builder.dims.end());
+      operands.push_back(pairs[i * 2]);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, operands);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
+  } else if (auto sliceOp = acoOp.getShape().getDefiningOp<SliceOp>()) {
+    auto triples = sliceOp.getTriples();
+    for (unsigned i = 0; i < indices.size(); ++i) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(indices[i]);
+      assert(expr && "analysis guaranteed index is affine");
+      auto lbSym = mlir::getAffineSymbolExpr(0, context);
+      auto strideSym = mlir::getAffineSymbolExpr(1, context);
+      auto adjustedExpr = (*expr - lbSym).floorDiv(strideSym);
+      auto map = mlir::AffineMap::get(builder.dims.size(), 2, adjustedExpr);
+      SmallVector<mlir::Value> operands;
+      operands.append(builder.dims.begin(), builder.dims.end());
+      operands.push_back(triples[i * 3]);
+      operands.push_back(triples[i * 3 + 2]);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, operands);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
   }
-}
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  if (auto shape = acoOp.getShape().getDefiningOp<ShapeOp>())
-    return populateIndexArgs(acoOp, shape, indexArgs, rewriter);
-  if (auto shapeShift = acoOp.getShape().getDefiningOp<ShapeShiftOp>())
-    return populateIndexArgs(acoOp, shapeShift, indexArgs, rewriter);
-  if (auto slice = acoOp.getShape().getDefiningOp<SliceOp>())
-    return populateIndexArgs(acoOp, slice, indexArgs, rewriter);
-}
+  // The indices must be reversed because memref is row-major order but
+  // fir.array is column-major order.
+  std::reverse(adjustedIndices.begin(), adjustedIndices.end());
 
-/// Returns affine.apply and fir.convert from array_coor and gendims
-static std::pair<affine::AffineApplyOp, fir::ConvertOp>
-createAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
-  auto acoOp = arrayRef.getDefiningOp<ArrayCoorOp>();
-  auto affineMap =
-      createArrayIndexAffineMap(acoOp.getIndices().size(), acoOp.getContext());
-  SmallVector<mlir::Value> indexArgs;
-  indexArgs.append(acoOp.getIndices().begin(), acoOp.getIndices().end());
-
-  populateIndexArgs(acoOp, indexArgs, rewriter);
-
-  auto affineApply = affine::AffineApplyOp::create(rewriter, acoOp.getLoc(),
-                                                   affineMap, indexArgs);
-  auto arrayElementType = coordinateArrayElement(acoOp);
-  auto newType =
-      mlir::MemRefType::get({mlir::ShapedType::kDynamic}, arrayElementType);
-  auto arrayConvert = fir::ConvertOp::create(rewriter, acoOp.getLoc(), newType,
-                                             acoOp.getMemref());
-  return std::make_pair(affineApply, arrayConvert);
+  return {std::move(adjustedIndices), arrayConvert};
 }
 
 static void rewriteLoad(fir::LoadOp loadOp, mlir::PatternRewriter &rewriter) {
   rewriter.setInsertionPoint(loadOp);
-  auto affineOps = createAffineOps(loadOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(loadOp.getMemref(), rewriter);
   rewriter.replaceOpWithNewOp<affine::AffineLoadOp>(
-      loadOp, affineOps.second.getResult(), affineOps.first.getResult());
+      loadOp, result.arrayConvert.getResult(), result.indices);
 }
 
 static void rewriteStore(fir::StoreOp storeOp,
                          mlir::PatternRewriter &rewriter) {
   rewriter.setInsertionPoint(storeOp);
-  auto affineOps = createAffineOps(storeOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(storeOp.getMemref(), rewriter);
   rewriter.replaceOpWithNewOp<affine::AffineStoreOp>(
-      storeOp, storeOp.getValue(), affineOps.second.getResult(),
-      affineOps.first.getResult());
+      storeOp, storeOp.getValue(), result.arrayConvert.getResult(),
+      result.indices);
 }
 
 static void rewriteMemoryOps(Block *block, mlir::PatternRewriter &rewriter) {
@@ -478,6 +559,19 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
         functionAnalysis.getChildLoopAnalysis(loop);
     if (!loopAnalysis.canPromoteToAffine())
       return rewriter.notifyMatchFailure(loop, "cannot promote to affine");
+
+    // All enclosing fir.do_loop ops must also be promotable.  Otherwise
+    // this loop's affine operations would reference fir.do_loop block args
+    // (not affine.for IVs) as dimension ids, which is invalid.
+    for (auto *parent = loop->getParentOp(); parent;
+         parent = parent->getParentOp()) {
+      if (auto parentLoop = dyn_cast<fir::DoLoopOp>(parent)) {
+        auto parentAnalysis = functionAnalysis.getChildLoopAnalysis(parentLoop);
+        if (!parentAnalysis.canPromoteToAffine())
+          return rewriter.notifyMatchFailure(
+              loop, "enclosing fir.do_loop is not promotable");
+      }
+    }
     auto &loopOps = loop.getBody()->getOperations();
     auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
     auto results = resultOp.getOperands();
@@ -525,18 +619,39 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
     return genericBounds(op, rewriter);
   }
 
+  /// Build an AffineMap + operands for a single loop bound using
+  /// AffineIndexBuilder.  Reuses the same recursive decomposition used for
+  /// array indices: fir.convert, arith.addi/subi/muli, constants, and
+  /// enclosing loop IVs are all handled uniformly.
+  ///
+  /// If the bound is an affine expression of enclosing loop IVs and
+  /// constants, those IVs become dimensions in the map (as required by the
+  /// affine verifier).  Otherwise the raw value is treated as a symbol.
+  static mlir::AffineMap boundMap(mlir::Value operand, int64_t offset,
+                                  mlir::MLIRContext *ctx,
+                                  SmallVectorImpl<mlir::Value> &mapOperands) {
+    AffineIndexBuilder builder(ctx);
+    if (auto expr = builder.build(operand)) {
+      mapOperands.append(builder.dims.begin(), builder.dims.end());
+      return mlir::AffineMap::get(builder.dims.size(), /*symbolCount=*/0,
+                                  *expr + offset);
+    }
+    mapOperands.push_back(operand);
+    return mlir::AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1,
+                                mlir::getAffineSymbolExpr(0, ctx) + offset);
+  }
+
   // when step for the loop is positive compile time constant
   std::pair<affine::AffineForOp, mlir::Value>
   positiveConstantStep(fir::DoLoopOp op, int64_t step,
                        mlir::PatternRewriter &rewriter) const {
+    auto *ctx = op.getContext();
+    SmallVector<mlir::Value> lbOperands, ubOperands;
+    auto lbMap = boundMap(op.getLowerBound(), 0, ctx, lbOperands);
+    auto ubMap = boundMap(op.getUpperBound(), 1, ctx, ubOperands);
     auto affineFor = affine::AffineForOp::create(
-        rewriter, op.getLoc(), ValueRange(op.getLowerBound()),
-        mlir::AffineMap::get(0, 1,
-                             mlir::getAffineSymbolExpr(0, op.getContext())),
-        ValueRange(op.getUpperBound()),
-        mlir::AffineMap::get(0, 1,
-                             1 + mlir::getAffineSymbolExpr(0, op.getContext())),
-        step, op.getIterOperands());
+        rewriter, op.getLoc(), lbOperands, lbMap, ubOperands, ubMap, step,
+        op.getIterOperands());
     return std::make_pair(affineFor, affineFor.getInductionVar());
   }
 
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 5a3059ebbd97f..ab64371e756f0 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_flang_library(FIRTransforms
   AddDebugInfo.cpp
   AffineDemotion.cpp
   AffinePromotion.cpp
+  SimplifyDoLoop.cpp
   AlgebraicSimplification.cpp
   AnnotateConstant.cpp
   ArrayValueCopy.cpp
diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
new file mode 100644
index 0000000000000..12f3c5a9a1f91
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -0,0 +1,639 @@
+//===-- SimplifyDoLoop.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// General-purpose FIR loop canonicalization pass.
+//
+// Transforms fir.do_loop nests into a canonical form suitable for affine
+// promotion and loop optimizations (tiling, fusion, interchange, etc.).
+//
+// The canonical form has:
+//   - No iter_args (shadow induction variable copies removed)
+//   - No memory-based IV tracking inside the loop body
+//   - Final IV values computed and stored after the outermost loop
+//
+// === Design Overview ===
+//
+// Analysis phase (per loop nest):
+//   1. Collect perfectly nested fir.do_loop chain.
+//   2. For each loop, verify iter_arg is a shadow of the induction variable:
+//      - init = fir.convert(lower_bound)
+//      - yield = arith.addi(iter_arg_or_load_of_iv, fir.convert(step))
+//   3. Verify safety conditions:
+//      a. Only one store to IV alloca inside loop (the init store of iter_arg)
+//      b. No function/subroutine calls in the nest
+//      c. IV alloca does not escape (only load/store/declare users)
+//      d. Loop results are only used for final IV stores
+//
+// Transformation phase:
+//   1. For each loop (innermost first):
+//      a. Remove the initial store (fir.store %iter_arg to %iv_alloca)
+//      b. Forward all loads of IV alloca inside loop body to fir.convert(IV)
+//         TODO: forwarding loads of the IV alloca could instead be done by
+//         another pass such as fir-memref-dataflow-opt (if it is available).
+//      c. Strip iter_args and fir.result, rebuild as simple fir.do_loop
+//   2. After the outermost loop, compute and store final IV values
+//      for all loops whose IV is live after the loop (outer to inner order).
+//      Fortran final value: final_iv = lb + ((ub - lb + step) / step) * step
+//      which equals the value of the iter_arg after the last increment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+namespace fir {
+#define GEN_PASS_DEF_SIMPLIFYDOLOOP
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "simplify-do-loop"
+
+using namespace fir;
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Per-loop bookkeeping built during analysis
+//===----------------------------------------------------------------------===//
+
+struct LoopIVInfo {
+  fir::DoLoopOp loop;
+  Value ivAlloca;                  // fir.alloca for this loop's IV
+  SmallVector<Value, 2> ivAliases; // ivAlloca + any fir.declare alias
+  Value lowerBound;                // index-typed lower bound
+  Value upperBound;                // index-typed upper bound
+  Value step;                      // index-typed step
+  Type ivType;                     // Fortran IV type (e.g. i32)
+};
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+/// Collect the IV memory reference and all its aliases (the raw fir.alloca
+/// and any fir.declare results that alias it).  `ivRef` may be either the
+/// alloca itself or a declare result — we normalise to the underlying alloca
+/// first, then collect all declare aliases from it.
+static SmallVector<Value, 2> collectAliases(Value ivRef) {
+  SmallVector<Value, 2> aliases;
+
+  // If ivRef is a declare result, trace back to the underlying alloca.
+  Value underlying = ivRef;
+  if (auto decl = ivRef.getDefiningOp<fir::DeclareOp>())
+    underlying = decl.getMemref();
+
+  aliases.push_back(underlying);
+  for (auto *user : underlying.getUsers())
+    if (auto decl = dyn_cast<fir::DeclareOp>(user))
+      aliases.push_back(decl.getResult());
+
+  return aliases;
+}
+
+/// Collect the chain of nested fir.do_loop ops starting from `outer`.
+/// Descent continues while the current loop's body directly contains exactly
+/// one fir.do_loop; other operations in that body are not inspected.
+static SmallVector<fir::DoLoopOp> collectNest(fir::DoLoopOp outer) {
+  SmallVector<fir::DoLoopOp> nest;
+  fir::DoLoopOp cur = outer;
+  while (cur) {
+    nest.push_back(cur);
+    fir::DoLoopOp inner;
+    unsigned loopCount = 0;
+    for (auto &op : cur.getBody()->getOperations())
+      if (auto nested = dyn_cast<fir::DoLoopOp>(op)) {
+        inner = nested;
+        ++loopCount;
+      }
+    if (loopCount != 1)
+      break;
+    cur = inner;
+  }
+  return nest;
+}
+
+/// Strip fir.convert chains to find the root SSA value.
+static Value stripConverts(Value val) {
+  while (auto conv = val.getDefiningOp<fir::ConvertOp>())
+    val = conv.getValue();
+  return val;
+}
+
+/// Check whether `val` originates from `target` (possibly through fir.convert).
+static bool originatesFrom(Value val, Value target) {
+  return stripConverts(val) == target;
+}
+
+/// Find IV alloca: the first fir.store in the loop body whose value
+/// originates from the iter_arg or the induction variable (possibly through
+/// fir.convert chains).
+/// The entire top-level body is scanned, rather than stopping at an inner
+/// fir.do_loop, so that the pass remains robust if upstream passes reorder
+/// operations.
+static Value findIVAlloca(fir::DoLoopOp loop) {
+  if (!loop.hasIterOperands() || loop.getNumIterOperands() < 1)
+    return {};
+  auto iterArg = loop.getRegionIterArgs()[0];
+  auto iv = loop.getInductionVar();
+  for (auto &op : loop.getBody()->getOperations()) {
+    if (auto store = dyn_cast<fir::StoreOp>(op)) {
+      Value stored = store.getValue();
+      if (originatesFrom(stored, iterArg) || originatesFrom(stored, iv))
+        return store.getMemref();
+    }
+  }
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
+//                          ANALYSIS PHASE
+//===----------------------------------------------------------------------===//
+
+// ---- Analysis 1: Confirm iter_arg is a shadow of the induction variable ----
+//
+// The iter_arg must mirror the index-typed induction variable:
+//   init  = fir.convert(lower_bound) : (index) -> i32
+//   yield = arith.addi(iter_arg_or_load_of_iv, fir.convert(step))
+
+static bool isShadowIV(fir::DoLoopOp loop, Value ivAlloca) {
+  auto iterOperands = loop.getIterOperands();
+  auto iterArg = loop.getRegionIterArgs()[0];
+
+  auto initConvert = iterOperands[0].getDefiningOp<fir::ConvertOp>();
+  if (!initConvert || initConvert.getValue() != loop.getLowerBound()) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] init is not fir.convert(lb)\n");
+    return false;
+  }
+
+  auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
+  auto addOp = resultOp.getOperand(0).getDefiningOp<arith::AddIOp>();
+  if (!addOp) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] yield is not arith.addi\n");
+    return false;
+  }
+
+  auto isIVValue = [&](Value v) -> bool {
+    if (v == iterArg)
+      return true;
+    if (auto load = v.getDefiningOp<fir::LoadOp>()) {
+      if (load.getMemref() == ivAlloca)
+        return true;
+      if (auto decl = load.getMemref().getDefiningOp<fir::DeclareOp>())
+        if (decl.getMemref() == ivAlloca)
+          return true;
+    }
+    return false;
+  };
+
+  Value stepSide;
+  if (isIVValue(addOp.getLhs()))
+    stepSide = addOp.getRhs();
+  else if (isIVValue(addOp.getRhs()))
+    stepSide = addOp.getLhs();
+  else {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] addi doesn't use iter_arg/IV\n");
+    return false;
+  }
+
+  auto stepConvert = stepSide.getDefiningOp<fir::ConvertOp>();
+  if (!stepConvert || stepConvert.getValue() != loop.getStep()) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] step operand mismatch\n");
+    return false;
+  }
+  return true;
+}
+
+// ---- Analysis 2: Only one store to IV alloca inside loop (the init store) --
+
+static bool singleStoreToIVAlloca(fir::DoLoopOp loop,
+                                  ArrayRef<Value> ivAliases) {
+  auto iterArg = loop.getRegionIterArgs()[0];
+  auto iv = loop.getInductionVar();
+  bool foundInit = false;
+  bool ok = true;
+
+  loop.walk([&](fir::StoreOp store) {
+    if (!llvm::is_contained(ivAliases, store.getMemref()))
+      return;
+    if (!foundInit && (originatesFrom(store.getValue(), iterArg) ||
+                       originatesFrom(store.getValue(), iv))) {
+      foundInit = true;
+      return;
+    }
+    LLVM_DEBUG(llvm::dbgs()
+               << "  [store] extra store to IV: " << store << "\n");
+    ok = false;
+  });
+  return ok;
+}
+
+// ---- Analysis 3: No function/subroutine calls in the nest -----------------
+
+static bool noCallsInNest(fir::DoLoopOp outermost) {
+  bool ok = true;
+  outermost.walk([&](Operation *op) {
+    if (isa<fir::CallOp>(op) || isa<func::CallOp>(op) ||
+        isa<fir::DispatchOp>(op)) {
+      LLVM_DEBUG(llvm::dbgs() << "  [call] found: " << *op << "\n");
+      ok = false;
+    }
+  });
+  return ok;
+}
+
+// ---- Analysis 4: IV alloca must not escape --------------------------------
+
+static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
+  for (auto alias : ivAliases)
+    for (auto *user : alias.getUsers())
+      if (!isa<fir::StoreOp, fir::LoadOp, fir::DeclareOp>(user)) {
+        LLVM_DEBUG(llvm::dbgs() << "  [escape] IV escapes: " << *user << "\n");
+        return false;
+      }
+  return true;
+}
+
+// ---- Full nest analysis ---------------------------------------------------
+
+static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
+  // --- Per-loop: shadow-IV check, IV alloca discovery, single-store check ---
+  for (auto &info : infos) {
+    auto loop = info.loop;
+    if (!loop.hasIterOperands() || loop.getNumIterOperands() != 1) {
+      LLVM_DEBUG(llvm::dbgs() << "  skip: loop has != 1 iter_args at "
+                              << loop.getLoc() << "\n");
+      return false;
+    }
+
+    info.ivAlloca = findIVAlloca(loop);
+    if (!info.ivAlloca) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  cannot find IV alloca at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    info.ivAliases = collectAliases(info.ivAlloca);
+
+    if (!isShadowIV(loop, info.ivAlloca)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  not shadow IV at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    if (!singleStoreToIVAlloca(loop, info.ivAliases)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  multiple stores at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    // Record loop bounds and IV type from the iter_arg init value.
+    info.lowerBound = loop.getLowerBound();
+    info.upperBound = loop.getUpperBound();
+    info.step = loop.getStep();
+    info.ivType = loop.getIterOperands()[0].getType();
+  }
+
+  // --- No function calls in the nest ---
+  if (!noCallsInNest(infos.front().loop))
+    return false;
+
+  // --- IV alloca must not escape ---
+  for (auto &info : infos) {
+    if (!ivDoesNotEscape(info.ivAliases))
+      return false;
+  }
+
+  // --- Loop results must only be used for final IV stores ---
+  for (auto &info : infos) {
+    for (auto result : info.loop.getResults()) {
+      for (auto *user : result.getUsers()) {
+        auto store = dyn_cast<fir::StoreOp>(user);
+        if (!store || !llvm::is_contained(info.ivAliases, store.getMemref())) {
+          LLVM_DEBUG(llvm::dbgs()
+                     << "  [result] loop result used outside IV store at "
+                     << info.loop.getLoc() << ": " << *user << "\n");
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                       TRANSFORMATION PHASE
+//===----------------------------------------------------------------------===//
+
+/// Ensure a value is available (dominates) at the current insertion point.
+/// If the value is already defined outside `outermost`, return it directly.
+/// Otherwise, rematerialize the computation by cloning through simple ops
+/// (fir.convert, fir.load, arith.constant, and selected arith ops).
+///
+/// `ivFinalMap` maps loop induction variables (block arguments) to their
+/// already-computed final index values.  This allows inner loop bounds that
+/// depend on outer IVs (e.g. triangular loops) to be correctly resolved.
+static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
+                                  OpBuilder &builder, Location loc,
+                                  const DenseMap<Value, Value> &ivFinalMap) {
+  // Already defined outside the outermost loop — use directly.
+  if (auto blockArg = dyn_cast<BlockArgument>(val)) {
+    if (!outermost->isAncestor(blockArg.getOwner()->getParentOp()))
+      return val;
+    auto it = ivFinalMap.find(val);
+    if (it != ivFinalMap.end())
+      return it->second;
+    return val;
+  }
+  if (auto *defOp = val.getDefiningOp()) {
+    if (!outermost->isAncestor(defOp))
+      return val;
+  }
+
+  auto *defOp = val.getDefiningOp();
+  if (!defOp)
+    return val;
+
+  // fir.convert: rematerialize the input, then re-emit the convert.
+  if (auto conv = dyn_cast<fir::ConvertOp>(*defOp)) {
+    auto newInput = rematerializeOutside(conv.getValue(), outermost, builder,
+                                         loc, ivFinalMap);
+    return fir::ConvertOp::create(builder, loc, conv.getType(), newInput);
+  }
+
+  // fir.load: the address must already be outside (alloca/declare/etc).
+  if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
+    auto addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
+                                     ivFinalMap);
+    return fir::LoadOp::create(builder, loc, addr);
+  }
+
+  // arith.constant: just clone it.
+  if (isa<arith::ConstantOp>(*defOp)) {
+    auto *cloned = builder.clone(*defOp);
+    return cloned->getResult(0);
+  }
+
+  // Arithmetic ops (addi, subi, muli, divsi, cmpi, select): rematerialize
+  // all operands recursively, then clone the op with new operands.
+  if (isa<arith::AddIOp, arith::SubIOp, arith::MulIOp, arith::DivSIOp,
+          arith::CmpIOp, arith::SelectOp>(*defOp)) {
+    SmallVector<Value> newOperands;
+    for (auto operand : defOp->getOperands())
+      newOperands.push_back(
+          rematerializeOutside(operand, outermost, builder, loc, ivFinalMap));
+    auto *cloned = builder.clone(*defOp);
+    for (unsigned i = 0; i < newOperands.size(); ++i)
+      cloned->setOperand(i, newOperands[i]);
+    return cloned->getResult(0);
+  }
+
+  // For anything else, assume it's already available.
+  return val;
+}
+
+/// Compute the Fortran final IV value and store it to the IV alloca.
+///
+/// Fortran DO loop semantics: after normal completion, the IV holds the
+/// value it would have received on the iteration that causes termination.
+/// For `DO I = lb, ub, step`:
+///   trip_count = MAX((ub - lb + step) / step, 0)
+///   final_iv   = lb + trip_count * step
+///
+/// Since the loop actually executed (we wouldn't reach here otherwise for
+/// an empty nest), we use the FIR loop's own bounds which are already
+/// index-typed. We compute:
+///   final_index = lb + ((ub - lb + step) / step) * step
+/// Then convert to the Fortran IV type (e.g. i32) and store.
+///
+/// `ivFinalMap` is populated with the mapping from this loop's IV (block arg)
+/// to its *last iteration value* (finalIndex - step).  Inner loops whose
+/// bounds depend on an outer IV need the value from the last iteration, not
+/// the Fortran final value (which is one step past the last iteration).
+/// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
+/// with i=100 (last iteration), not i=101 (Fortran final).
+static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
+                             fir::DoLoopOp outermost,
+                             DenseMap<Value, Value> &ivFinalMap) {
+  // Rematerialize bounds outside the outermost loop if needed.
+  // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV
+  // block argument will be resolved via ivFinalMap.
+  Value lb = rematerializeOutside(info.lowerBound, outermost, builder, loc,
+                                  ivFinalMap);
+  Value ub = rematerializeOutside(info.upperBound, outermost, builder, loc,
+                                  ivFinalMap);
+  Value step =
+      rematerializeOutside(info.step, outermost, builder, loc, ivFinalMap);
+
+  // trip_count = (ub - lb + step) / step
+  Value ubMinusLb = arith::SubIOp::create(builder, loc, ub, lb);
+  Value ubMinusLbPlusStep =
+      arith::AddIOp::create(builder, loc, ubMinusLb, step);
+  Value tripCount =
+      arith::DivSIOp::create(builder, loc, ubMinusLbPlusStep, step);
+
+  // Clamp trip count to >= 0.
+  Value zero = arith::ConstantIndexOp::create(builder, loc, 0);
+  Value isPositive = arith::CmpIOp::create(
+      builder, loc, arith::CmpIPredicate::sgt, tripCount, zero);
+  Value clampedTrip =
+      arith::SelectOp::create(builder, loc, isPositive, tripCount, zero);
+
+  // final_index = lb + trip_count * step
+  Value tripTimesStep = arith::MulIOp::create(builder, loc, clampedTrip, step);
+  Value finalIndex = arith::AddIOp::create(builder, loc, lb, tripTimesStep);
+
+  // Record the *last iteration* value (finalIndex - step) for this IV so
+  // that inner loops whose bounds depend on this IV use the correct value.
+  // Fortran final value = lb + trip_count * step (one step PAST the last
+  // iteration), but the inner loop's last execution sees the outer IV at
+  // lb + (trip_count - 1) * step.
+  Value lastIterValue = arith::SubIOp::create(builder, loc, finalIndex, step);
+  ivFinalMap[info.loop.getInductionVar()] = lastIterValue;
+
+  // Convert from index to the Fortran IV type (e.g. i32).
+  Value finalIV = fir::ConvertOp::create(builder, loc, info.ivType, finalIndex);
+
+  // Store to the IV alloca.
+  fir::StoreOp::create(builder, loc, finalIV, info.ivAlloca);
+
+  LLVM_DEBUG(llvm::dbgs() << "  emitted final IV store for " << info.ivAlloca
+                          << " at " << loc << "\n");
+}
+
+/// Transform one loop: remove init/final stores, forward IV loads, strip
+/// iter_args, and rebuild as a simple fir.do_loop.
+static fir::DoLoopOp transformOneLoop(fir::DoLoopOp loop,
+                                      ArrayRef<Value> ivAliases,
+                                      OpBuilder &builder) {
+  auto loc = loop.getLoc();
+  auto iv = loop.getInductionVar();
+  auto iterArg = loop.getRegionIterArgs()[0];
+
+  LLVM_DEBUG(llvm::dbgs() << "  transforming loop at " << loc << "\n");
+
+  // Identify the increment addi (yielded by fir.result).
+  auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
+  Operation *incrementOp = nullptr;
+  if (auto addOp = resultOp.getOperand(0).getDefiningOp<arith::AddIOp>())
+    incrementOp = addOp;
+
+  // --- Remove initial store to IV alloca ---
+  // The init store may be:  fir.store %iterArg to %alloca
+  //                    or:  fir.store (fir.convert %iterArg) to %alloca
+  //                    or:  fir.store (fir.convert %iv) to %alloca
+  // Scan the loop body (before any inner loop) and erase the first store
+  // to any IV alias whose value originates from iterArg or the IV.
+  for (auto &op : llvm::make_early_inc_range(*loop.getBody())) {
+    if (auto store = dyn_cast<fir::StoreOp>(op)) {
+      if (llvm::is_contained(ivAliases, store.getMemref()) &&
+          (originatesFrom(store.getValue(), iterArg) ||
+           originatesFrom(store.getValue(), iv))) {
+        // Any dead fir.convert chain feeding this store will be cleaned up
+        // by the subsequent canonicalize pass in the pipeline.
+        store.erase();
+        break; // only remove the first (init) store
+      }
+    }
+  }
+
+  // --- Remove final store: fir.store %loop_result to %iv_alloca ---
+  for (auto result : loop.getResults()) {
+    for (auto *user : llvm::make_early_inc_range(result.getUsers()))
+      if (auto store = dyn_cast<fir::StoreOp>(user))
+        if (llvm::is_contained(ivAliases, store.getMemref()))
+          store.erase();
+  }
+
+  // --- Forward loads of IV alloca anywhere inside loop → fir.convert(IV) ---
+  // The initial store was removed, so loads of the IV alloca inside the
+  // loop (including nested loops) now need to read from the index-typed
+  // induction variable (converted to the IV's Fortran type).
+  loop.walk([&](fir::LoadOp load) {
+    if (llvm::is_contained(ivAliases, load.getMemref())) {
+      builder.setInsertionPoint(load);
+      auto ivCast = fir::ConvertOp::create(builder, loc, load.getType(), iv);
+      load.getResult().replaceAllUsesWith(ivCast);
+      load.erase();
+    }
+  });
+
+  // --- Replace remaining iter_arg uses with fir.convert(IV) ---
+  {
+    SmallVector<OpOperand *> uses;
+    for (auto &use : iterArg.getUses())
+      uses.push_back(&use);
+
+    for (auto *use : uses) {
+      if (use->getOwner() == incrementOp)
+        continue;
+      builder.setInsertionPoint(use->getOwner());
+      auto ivCast = fir::ConvertOp::create(builder, loc, iterArg.getType(), iv);
+      use->set(ivCast);
+    }
+  }
+
+  // --- Clear fir.result operands ---
+  auto *terminator = loop.getBody()->getTerminator();
+  terminator->eraseOperands(0, terminator->getNumOperands());
+
+  // Erase the increment addi (its result was the fir.result operand).
+  if (incrementOp && incrementOp->use_empty())
+    incrementOp->erase();
+
+  // --- Rebuild loop without iter_args ---
+  builder.setInsertionPoint(loop);
+  auto newLoop = fir::DoLoopOp::create(builder, loc, loop.getLowerBound(),
+                                       loop.getUpperBound(), loop.getStep());
+  loop.getInductionVar().replaceAllUsesWith(newLoop.getInductionVar());
+
+  auto &oldOps = loop.getBody()->getOperations();
+  auto &newOps = newLoop.getBody()->getOperations();
+  newOps.splice(newOps.begin(), oldOps, oldOps.begin(),
+                std::prev(oldOps.end()));
+
+  loop.erase();
+  return newLoop;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass entry
+//===----------------------------------------------------------------------===//
+
+class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
+public:
+  void runOnOperation() override {
+    auto func = getOperation();
+
+    // Collect all outermost fir.do_loop ops.
+    SmallVector<fir::DoLoopOp> outerLoops;
+    func.walk([&](fir::DoLoopOp loop) {
+      if (!loop->getParentOfType<fir::DoLoopOp>())
+        outerLoops.push_back(loop);
+    });
+
+    for (auto outerLoop : outerLoops) {
+      auto nestLoops = collectNest(outerLoop);
+      LLVM_DEBUG(llvm::dbgs()
+                 << "SimplifyDoLoop: nest depth " << nestLoops.size() << " at "
+                 << outerLoop.getLoc() << "\n");
+
+      if (nestLoops.empty()) {
+        LLVM_DEBUG(llvm::dbgs() << "  skip (empty nest)\n");
+        continue;
+      }
+
+      // ======== Analysis Phase ========
+      SmallVector<LoopIVInfo> infos;
+      for (auto loop : nestLoops)
+        infos.push_back({loop, {}, {}, {}, {}, {}, {}});
+
+      if (!analyzeNest(infos)) {
+        LLVM_DEBUG(llvm::dbgs() << "  nest rejected by analysis\n");
+        continue;
+      }
+
+      LLVM_DEBUG(llvm::dbgs() << "  analysis passed — transforming "
+                              << infos.size() << " loops\n");
+
+      // ======== Transformation Phase ========
+      OpBuilder builder(func.getContext());
+
+      for (int i = infos.size() - 1; i >= 0; --i)
+        infos[i].loop =
+            transformOneLoop(infos[i].loop, infos[i].ivAliases, builder);
+
+      // ---- After the outermost loop, emit final IV value stores. ----
+      //         Process outer-to-inner so that outer IV final values are
+      //         available when computing inner IV finals (e.g. triangular
+      //         loops where inner bounds depend on outer IVs).
+      fir::DoLoopOp outermostNew = infos.front().loop;
+      builder.setInsertionPointAfter(outermostNew);
+
+      DenseMap<Value, Value> ivFinalMap;
+      for (auto &info : infos)
+        emitFinalIVStore(builder, outermostNew.getLoc(), info, outermostNew,
+                         ivFinalMap);
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::createSimplifyDoLoopPass() {
+  return std::make_unique<SimplifyDoLoop>();
+}
diff --git a/flang/test/Fir/affine-demotion.fir b/flang/test/Fir/affine-demotion.fir
index bdb84be3624cb..635784c5e4bb9 100644
--- a/flang/test/Fir/affine-demotion.fir
+++ b/flang/test/Fir/affine-demotion.fir
@@ -4,27 +4,31 @@
 
 #map0 = affine_map<()[s0, s1] -> (s1 - s0 + 1)>
 #map1 = affine_map<()[s0] -> (s0 + 1)>
-#map2 = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 - s0)>
+#map2 = affine_map<(d0) -> (d0 - 1)>
 module  {
   func.func @calc(%arg0: !fir.ref<!fir.array<?xf32>>, %arg1: !fir.ref<!fir.array<?xf32>>, %arg2: !fir.ref<!fir.array<?xf32>>) {
     %c1 = arith.constant 1 : index
     %c100 = arith.constant 100 : index
     %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+    %a = fir.declare %arg0(%0) {uniq_name = "a"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %b = fir.declare %arg1(%0) {uniq_name = "b"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
     %1 = affine.apply #map0()[%c1, %c100]
     %2 = fir.alloca !fir.array<?xf32>, %1
-    %3 = fir.convert %arg0 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-    %4 = fir.convert %arg1 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-    %5 = fir.convert %2 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %t = fir.declare %2(%0) {uniq_name = "t"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %3 = fir.convert %a : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %4 = fir.convert %b : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %5 = fir.convert %t : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
     affine.for %arg3 = %c1 to #map1()[%c100] {
-      %7 = affine.apply #map2(%arg3)[%c1, %c100, %c1]
+      %7 = affine.apply #map2(%arg3)
       %8 = affine.load %3[%7] : memref<?xf32>
       %9 = affine.load %4[%7] : memref<?xf32>
       %10 = arith.addf %8, %9 : f32
       affine.store %10, %5[%7] : memref<?xf32>
     }
-    %6 = fir.convert %arg2 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %c = fir.declare %arg2(%0) {uniq_name = "c"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %6 = fir.convert %c : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
     affine.for %arg3 = %c1 to #map1()[%c100] {
-      %7 = affine.apply #map2(%arg3)[%c1, %c100, %c1]
+      %7 = affine.apply #map2(%arg3)
       %8 = affine.load %5[%7] : memref<?xf32>
       %9 = affine.load %4[%7] : memref<?xf32>
       %10 = arith.mulf %8, %9 : f32
@@ -34,35 +38,135 @@ module  {
   }
 }
 
-// CHECK:  func @calc(%[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_2:.*]]: !fir.ref<!fir.array<?xf32>>) {
-// CHECK:    %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK:    %[[VAL_4:.*]] = arith.constant 100 : index
-// CHECK:    %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-// CHECK:    %[[VAL_6:.*]] = arith.constant 100 : index
-// CHECK:    %[[VAL_7:.*]] = fir.alloca !fir.array<?xf32>, %[[VAL_6]]
-// CHECK:    %[[VAL_8:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    %[[VAL_10:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    affine.for %[[VAL_11:.*]] = 1 to 101 {
-// CHECK:      %[[VAL_12:.*]] = affine.apply #map(%[[VAL_11]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
-// CHECK:      %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_8]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-// CHECK:      %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_9]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-// CHECK:      %[[VAL_17:.*]] = arith.addf %[[VAL_14]], %[[VAL_16]] : f32
-// CHECK:      %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_10]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      fir.store %[[VAL_17]] to %[[VAL_18]] : !fir.ref<f32>
+// CHECK:  func @calc(%[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<?xf32>>, %[[ARG2:.*]]: !fir.ref<!fir.array<?xf32>>) {
+// CHECK:    %[[C1:.*]] = arith.constant 1 : index
+// CHECK:    %[[C100:.*]] = arith.constant 100 : index
+// CHECK:    %[[SHP:.*]] = fir.shape %[[C100]] : (index) -> !fir.shape<1>
+// CHECK:    %[[A:.*]] = fir.declare %[[ARG0]](%[[SHP]]) {uniq_name = "a"}
+// CHECK:    %[[B:.*]] = fir.declare %[[ARG1]](%[[SHP]]) {uniq_name = "b"}
+// CHECK:    %[[ALLOCSZ:.*]] = arith.constant 100 : index
+// CHECK:    %[[ALLOC:.*]] = fir.alloca !fir.array<?xf32>, %[[ALLOCSZ]]
+// CHECK:    %[[T:.*]] = fir.declare %[[ALLOC]](%[[SHP]]) {uniq_name = "t"}
+// fir.convert removed — affine.load/store demoted to fir.array_coor + fir.load/store:
+// CHECK:    affine.for %[[IV1:.*]] = 1 to 101 {
+// CHECK:      %[[IDX1:.*]] = affine.apply #{{.*}}(%[[IV1]])
+// 0-based → 1-based for a(i):
+// CHECK:      %[[C1_A:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_A:.*]] = arith.addi %[[IDX1]], %[[C1_A]] : index
+// CHECK:      %[[A_COOR:.*]] = fir.array_coor %[[A]](%[[SHP]]) %[[FI_A]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[A_VAL:.*]] = fir.load %[[A_COOR]] : !fir.ref<f32>
+// 0-based → 1-based for b(i):
+// CHECK:      %[[C1_B:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_B:.*]] = arith.addi %[[IDX1]], %[[C1_B]] : index
+// CHECK:      %[[B_COOR1:.*]] = fir.array_coor %[[B]](%[[SHP]]) %[[FI_B]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[B_VAL1:.*]] = fir.load %[[B_COOR1]] : !fir.ref<f32>
+// CHECK:      %[[ADD:.*]] = arith.addf %[[A_VAL]], %[[B_VAL1]] : f32
+// 0-based → 1-based for t(i):
+// CHECK:      %[[C1_T:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_T:.*]] = arith.addi %[[IDX1]], %[[C1_T]] : index
+// CHECK:      %[[T_COOR1:.*]] = fir.array_coor %[[T]](%[[SHP]]) %[[FI_T]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      fir.store %[[ADD]] to %[[T_COOR1]] : !fir.ref<f32>
 // CHECK:    }
-// CHECK:    %[[VAL_19:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    affine.for %[[VAL_20:.*]] = 1 to 101 {
-// CHECK:      %[[VAL_21:.*]] = affine.apply #map(%[[VAL_20]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
-// CHECK:      %[[VAL_22:.*]] = fir.coordinate_of %[[VAL_10]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<f32>
-// CHECK:      %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_9]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<f32>
-// CHECK:      %[[VAL_26:.*]] = arith.mulf %[[VAL_23]], %[[VAL_25]] : f32
-// CHECK:      %[[VAL_27:.*]] = fir.coordinate_of %[[VAL_19]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      fir.store %[[VAL_26]] to %[[VAL_27]] : !fir.ref<f32>
+// CHECK:    %[[C:.*]] = fir.declare %[[ARG2]](%[[SHP]]) {uniq_name = "c"}
+// CHECK:    affine.for %[[IV2:.*]] = 1 to 101 {
+// CHECK:      %[[IDX2:.*]] = affine.apply #{{.*}}(%[[IV2]])
+// 0-based → 1-based for t(i):
+// CHECK:      %[[C1_T2:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_T2:.*]] = arith.addi %[[IDX2]], %[[C1_T2]] : index
+// CHECK:      %[[T_COOR2:.*]] = fir.array_coor %[[T]](%[[SHP]]) %[[FI_T2]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[T_VAL:.*]] = fir.load %[[T_COOR2]] : !fir.ref<f32>
+// 0-based → 1-based for b(i):
+// CHECK:      %[[C1_B2:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_B2:.*]] = arith.addi %[[IDX2]], %[[C1_B2]] : index
+// CHECK:      %[[B_COOR2:.*]] = fir.array_coor %[[B]](%[[SHP]]) %[[FI_B2]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[B_VAL2:.*]] = fir.load %[[B_COOR2]] : !fir.ref<f32>
+// CHECK:      %[[MUL:.*]] = arith.mulf %[[T_VAL]], %[[B_VAL2]] : f32
+// 0-based → 1-based for c(i):
+// CHECK:      %[[C1_C:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_C:.*]] = arith.addi %[[IDX2]], %[[C1_C]] : index
+// CHECK:      %[[C_COOR:.*]] = fir.array_coor %[[C]](%[[SHP]]) %[[FI_C]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      fir.store %[[MUL]] to %[[C_COOR]] : !fir.ref<f32>
 // CHECK:    }
 // CHECK:    return
 // CHECK:  }
+
+// -----
+
+// Test: 2D nested loop demotion with static-shape arrays.
+#map2 = affine_map<()[s0] -> (s0 + 1)>
+#map3 = affine_map<(d0) -> (d0 - 1)>
+module {
+  func.func @calc_2d_static(%arg0: !fir.ref<!fir.array<100x100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>) {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %0 = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+    %1 = fir.convert %arg0 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32> // source, read by the affine.load below
+    %2 = fir.convert %arg1 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32> // destination, written by the affine.store below
+    affine.for %arg2 = %c1 to #map2()[%c100] { // bounds: %c1 up to #map2 = %c100 + 1
+      %3 = affine.apply #map3(%arg2) // #map3: %arg2 - 1
+      affine.for %arg3 = %c1 to #map2()[%c100] { // same bounds as the outer loop
+        %4 = affine.apply #map3(%arg3) // #map3: %arg3 - 1
+        %5 = affine.load %1[%3, %4] : memref<100x100xf32>
+        affine.store %5, %2[%3, %4] : memref<100x100xf32> // element copy at the same [%3, %4] position
+      }
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func @calc_2d_static(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK:    %[[C1:.*]] = arith.constant 1 : index
+// CHECK:    %[[C100:.*]] = arith.constant 100 : index
+// CHECK:    %[[SHP:.*]] = fir.shape %[[C100]], %[[C100]] : (index, index) -> !fir.shape<2>
+// fir.convert removed — static arrays use fir.coordinate_of directly:
+// CHECK:    affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:      %[[IDXI:.*]] = affine.apply #{{.*}}(%[[I]])
+// CHECK:      affine.for %[[J:.*]] = 1 to 101 {
+// CHECK:        %[[IDXJ:.*]] = affine.apply #{{.*}}(%[[J]])
+// Indices reversed (row-major memref → column-major Fortran):
+// CHECK:        %[[A_COOR:.*]] = fir.coordinate_of %[[A]], %[[IDXJ]], %[[IDXI]] : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:        %[[A_VAL:.*]] = fir.load %[[A_COOR]] : !fir.ref<f32>
+// CHECK:        %[[B_COOR:.*]] = fir.coordinate_of %[[B]], %[[IDXJ]], %[[IDXI]] : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:        fir.store %[[A_VAL]] to %[[B_COOR]] : !fir.ref<f32>
+// CHECK:      }
+// CHECK:    }
+// CHECK:    return
+
+// -----
+
+// Test: Triangular loop demotion — inner bound depends on outer IV.
+#map4 = affine_map<()[s0] -> (s0 + 1)>
+#map5 = affine_map<(d0) -> (d0 + 1)>
+#map6 = affine_map<(d0) -> (d0 - 1)>
+module {
+  func.func @triangular_demotion(%arg0: !fir.ref<!fir.array<100x100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>) {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %0 = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+    %1 = fir.convert %arg0 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32> // source, read by the affine.load below
+    %2 = fir.convert %arg1 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32> // destination, written by the affine.store below
+    affine.for %arg2 = %c1 to #map4()[%c100] { // outer bound from a symbol: #map4 = %c100 + 1
+      %3 = affine.apply #map6(%arg2) // #map6: %arg2 - 1
+      affine.for %arg3 = %c1 to #map5(%arg2) { // inner bound from the outer IV: #map5 = %arg2 + 1
+        %4 = affine.apply #map6(%arg3) // #map6: %arg3 - 1
+        %5 = affine.load %1[%3, %4] : memref<100x100xf32>
+        affine.store %5, %2[%3, %4] : memref<100x100xf32> // element copy at the same [%3, %4] position
+      }
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func @triangular_demotion(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// Outer: constant bound; Inner: IV-dependent bound
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:         affine.for %[[J:.*]] = 1 to #{{.*}}(%[[I]]) {
+// CHECK:           fir.coordinate_of %[[A]], %{{.*}}, %{{.*}} : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:           fir.load %{{.*}} : !fir.ref<f32>
+// CHECK:           fir.coordinate_of %[[B]], %{{.*}}, %{{.*}} : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:           fir.store %{{.*}} to %{{.*}} : !fir.ref<f32>
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
diff --git a/flang/test/Fir/affine-promotion.fir b/flang/test/Fir/affine-promotion.fir
index 46467ab4a292a..d48d66cbd8a9f 100644
--- a/flang/test/Fir/affine-promotion.fir
+++ b/flang/test/Fir/affine-promotion.fir
@@ -55,16 +55,16 @@ func.func @loop_with_load_and_store(%a1: !arr_d1, %a2: !arr_d1, %a3: !arr_d1) {
 // CHECK:    %[[VAL_8:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
 // CHECK:    %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
 // CHECK:    %[[VAL_10:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:    affine.for %[[VAL_11:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_4]]] {
-// CHECK:      %[[VAL_12:.*]] = affine.apply #{{.*}}(%[[VAL_11]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
+// CHECK:    affine.for %[[VAL_11:.*]] = 1 to 101 {
+// CHECK:      %[[VAL_12:.*]] = affine.apply #{{.*}}(%[[VAL_11]])
 // CHECK:      %[[VAL_13:.*]] = affine.load %[[VAL_8]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:      %[[VAL_14:.*]] = affine.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:      %[[VAL_15:.*]] = arith.addf %[[VAL_13]], %[[VAL_14]] : f32
 // CHECK:      affine.store %[[VAL_15]], %[[VAL_10]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:    }
 // CHECK:    %[[VAL_16:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:    affine.for %[[VAL_17:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_4]]] {
-// CHECK:      %[[VAL_18:.*]] = affine.apply #{{.*}}(%[[VAL_17]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
+// CHECK:    affine.for %[[VAL_17:.*]] = 1 to 101 {
+// CHECK:      %[[VAL_18:.*]] = affine.apply #{{.*}}(%[[VAL_17]])
 // CHECK:      %[[VAL_19:.*]] = affine.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref<?xf32>
 // CHECK:      %[[VAL_20:.*]] = affine.load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xf32>
 // CHECK:      %[[VAL_21:.*]] = arith.mulf %[[VAL_19]], %[[VAL_20]] : f32
@@ -106,32 +106,30 @@ func.func @loop_with_if(%a: !arr_d1, %v: f32) {
   }
   return
 }
-
 // CHECK: func @loop_with_if(%[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_1:.*]]: f32) {
-// CHECK:   %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK:   %[[VAL_4:.*]] = arith.constant 2 : index
 // CHECK:   %[[VAL_5:.*]] = arith.constant 100 : index
 // CHECK:   %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
 // CHECK:   %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:   affine.for %[[VAL_8:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
-// CHECK:     %[[VAL_9:.*]] = affine.apply #{{.*}}(%[[VAL_8]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:   affine.for %[[VAL_8:.*]] = 1 to 101 {
+// CHECK:     %[[VAL_9:.*]] = affine.apply #{{.*}}(%[[VAL_8]])
 // CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_9]]] : memref<?xf32>
 // CHECK:   }
-// CHECK:   affine.for %[[VAL_10:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
-// CHECK:     %[[VAL_11:.*]] = affine.apply #{{.*}}(%[[VAL_10]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:   affine.for %[[VAL_10:.*]] = 1 to 101 {
+// CHECK:     %[[VAL_11:.*]] = affine.apply #{{.*}}(%[[VAL_10]])
 // CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref<?xf32>
 // CHECK:   }
-// CHECK:   affine.for %[[VAL_12:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
+// CHECK:   affine.for %[[VAL_12:.*]] = 1 to 101 {
 // CHECK:     %[[VAL_13:.*]] = arith.subi %[[VAL_12]], %[[VAL_4]] : index
 // CHECK:     affine.if #set(%[[VAL_12]]) {
-// CHECK:       %[[VAL_14:.*]] = affine.apply #{{.*}}(%[[VAL_12]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:       %[[VAL_14:.*]] = affine.apply #{{.*}}(%[[VAL_12]])
 // CHECK:       affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_14]]] : memref<?xf32>
 // CHECK:     }
 // CHECK:   }
 // CHECK:   return
 // CHECK: }
 
+
 func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>, %arg2: !fir.ref<!fir.array<100xf32>>) -> f32 {
   %c1 = arith.constant 1 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -183,32 +181,35 @@ func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.re
 // CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 // CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
 // CHECK:           %[[VAL_5:.*]] = fir.alloca i32
-// CHECK:           %[[VAL_6:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:           %[[VAL_7:.*]] = affine.for %[[VAL_8:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_9:.*]] = %[[VAL_1]]) -> (f32) {
-// CHECK:             %[[VAL_10:.*]] = affine.apply #{{.*}}(%[[VAL_8]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:             %[[VAL_11:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<?xf32>
+// First loop promoted — memref<100xf32> (static shape):
+// CHECK:           %[[VAL_6:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:           %[[VAL_7:.*]] = affine.for %[[VAL_8:.*]] = 1 to 101 iter_args(%[[VAL_9:.*]] = %[[VAL_1]]) -> (f32) {
+// CHECK:             %[[VAL_10:.*]] = affine.apply #{{.*}}(%[[VAL_8]])
+// CHECK:             %[[VAL_11:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<100xf32>
 // CHECK:             %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
 // CHECK:             affine.yield %[[VAL_12]] : f32
 // CHECK:           }
+// Middle loop stays as fir.do_loop (non-promotable: fir.convert reinterprets pointer):
 // CHECK:           %[[VAL_13:.*]]:2 = fir.do_loop %[[VAL_14:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_0]] iter_args(%[[VAL_15:.*]] = %[[VAL_7]]) -> (index, f32) {
 // CHECK:             %[[VAL_16:.*]] = fir.array_coor %[[ARG1]](%[[VAL_4]]) %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, index, index) -> !fir.ref<f32>
 // CHECK:             %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<f32>) -> !fir.ref<!fir.array<100xf32>>
-// CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:             %[[VAL_19:.*]] = affine.for %[[VAL_20:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_21:.*]] = %[[VAL_15]]) -> (f32) {
-// CHECK:               %[[VAL_22:.*]] = affine.apply #{{.*}}(%[[VAL_20]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:               %[[VAL_23:.*]] = affine.load %[[VAL_18]]{{\[}}%[[VAL_22]]] : memref<?xf32>
+// Inner loop also stays as fir.do_loop:
+// CHECK:             %[[VAL_19:.*]] = fir.do_loop %[[VAL_20:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_0]] iter_args(%[[VAL_21:.*]] = %[[VAL_15]]) -> (f32) {
+// CHECK:               %[[VAL_22:.*]] = fir.array_coor %[[VAL_17]](%[[VAL_3]]) %[[VAL_20]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<f32>
 // CHECK:               %[[VAL_24:.*]] = arith.addf %[[VAL_21]], %[[VAL_23]] fastmath<contract> : f32
-// CHECK:               affine.yield %[[VAL_24]] : f32
+// CHECK:               fir.result %[[VAL_24]] : f32
 // CHECK:             }
 // CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_14]], %[[VAL_0]] overflow<nsw> : index
 // CHECK:             fir.result %[[VAL_25]], %[[VAL_19]] : index, f32
 // CHECK:           }
-// CHECK:           %[[VAL_26:.*]] = fir.convert %[[ARG2]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:           %[[VAL_27:.*]]:2 = affine.for %[[VAL_28:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_29:.*]] = %[[VAL_30:.*]]#1, %[[VAL_31:.*]] = %[[VAL_1]]) -> (f32, f32) {
-// CHECK:             %[[VAL_32:.*]] = affine.apply #{{.*}}(%[[VAL_28]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:             %[[VAL_33:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_32]]] : memref<?xf32>
+// Last loop promoted — dual reduction:
+// CHECK:           %[[VAL_26:.*]] = fir.convert %[[ARG2]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:           %[[VAL_27:.*]]:2 = affine.for %[[VAL_28:.*]] = 1 to 101 iter_args(%[[VAL_29:.*]] = %[[VAL_30:.*]]#1, %[[VAL_31:.*]] = %[[VAL_1]]) -> (f32, f32) {
+// CHECK:             %[[VAL_32:.*]] = affine.apply #{{.*}}(%[[VAL_28]])
+// CHECK:             %[[VAL_33:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_32]]] : memref<100xf32>
 // CHECK:             %[[VAL_34:.*]] = arith.addf %[[VAL_29]], %[[VAL_33]] fastmath<contract> : f32
-// CHECK:             %[[VAL_35:.*]] = affine.load %[[VAL_26]]{{\[}}%[[VAL_32]]] : memref<?xf32>
+// CHECK:             %[[VAL_35:.*]] = affine.load %[[VAL_26]]{{\[}}%[[VAL_32]]] : memref<100xf32>
 // CHECK:             %[[VAL_36:.*]] = arith.addf %[[VAL_31]], %[[VAL_35]] fastmath<contract> : f32
 // CHECK:             affine.yield %[[VAL_34]], %[[VAL_36]] : f32, f32
 // CHECK:           }
@@ -217,3 +218,99 @@ func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.re
 // CHECK:           fir.store %[[VAL_39]] to %[[VAL_5]] : !fir.ref<i32>
 // CHECK:           return %[[VAL_37]] : f32
 // CHECK:         }
+
+
+// -----
+
+// Test: Simple matrix multiplication C(j,i) += A(k,i) * B(j,k)
+// Triple-nested loop with all three arrays using 2D indexing.
+// Fortran: do i=1,N; do j=1,N; do k=1,N; C(j,i) = C(j,i) + A(k,i)*B(j,k); end do; end do; end do
+func.func @matmul(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2> // one shape shared by a, b and c
+  fir.do_loop %i = %c1 to %c100 step %c1 {
+    fir.do_loop %j = %c1 to %c100 step %c1 {
+      fir.do_loop %k = %c1 to %c100 step %c1 {
+        %k32 = fir.convert %k : (index) -> i32 // each IV goes index -> i32 -> i64 before indexing
+        %k64 = fir.convert %k32 : (i32) -> i64
+        %i32 = fir.convert %i : (index) -> i32
+        %i64 = fir.convert %i32 : (i32) -> i64
+        %j32 = fir.convert %j : (index) -> i32
+        %j64 = fir.convert %j32 : (i32) -> i64
+        %a_idx = fir.array_coor %a(%shp) %k64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32> // address of A(k,i)
+        %a_val = fir.load %a_idx : !fir.ref<f32>
+        %b_idx = fir.array_coor %b(%shp) %j64, %k64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32> // address of B(j,k)
+        %b_val = fir.load %b_idx : !fir.ref<f32>
+        %mul = arith.mulf %a_val, %b_val fastmath<contract> : f32
+        %c_idx = fir.array_coor %c(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32> // address of C(j,i)
+        %c_val = fir.load %c_idx : !fir.ref<f32>
+        %sum = arith.addf %c_val, %mul fastmath<contract> : f32
+        fir.store %sum to %c_idx : !fir.ref<f32> // C(j,i) is read and rewritten on every k iteration
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @matmul(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[C:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// CHECK:       %[[SHP:.*]] = fir.shape %[[C100]], %[[C100]]
+// CHECK:       %[[AM:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       %[[BM:.*]] = fir.convert %[[B]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       %[[CM:.*]] = fir.convert %[[C]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:         %[[IDXI:.*]] = affine.apply #{{.*}}(%[[I]])
+// CHECK:         affine.for %[[J:.*]] = 1 to 101 {
+// CHECK:           %[[IDXJ:.*]] = affine.apply #{{.*}}(%[[J]])
+// CHECK:           %[[CVAL:.*]] = affine.load %[[CM]][%[[IDXI]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:           affine.for %[[K:.*]] = 1 to 101 {
+// CHECK:             %[[IDXK:.*]] = affine.apply #{{.*}}(%[[K]])
+// CHECK:             %[[AVAL:.*]] = affine.load %[[AM]][%[[IDXI]], %[[IDXK]]] : memref<100x100xf32>
+// CHECK:             %[[BVAL:.*]] = affine.load %[[BM]][%[[IDXK]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:             %[[MUL:.*]] = arith.mulf %[[AVAL]], %[[BVAL]] fastmath<contract> : f32
+// CHECK:             %[[SUM:.*]] = arith.addf %[[CVAL]], %[[MUL]] fastmath<contract> : f32
+// CHECK:             affine.store %[[SUM]], %[[CM]][%[[IDXI]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:           }
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
+
+// -----
+
+// Test: Triangular loop promoted to affine.for with dimension-based upper bound.
+// Inner bound j=1..i uses affine_map<(d0) -> (d0 + 1)>(%outer_iv) — a dimension,
+// not a symbol, because the outer IV is a loop induction variable.
+func.func @triangular_promotion(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2> // one shape shared by a and b
+  fir.do_loop %i = %c1 to %c100 step %c1 {
+    fir.do_loop %j = %c1 to %i step %c1 { // upper bound is the outer IV %i, not a constant
+      %j32 = fir.convert %j : (index) -> i32 // each IV goes index -> i32 -> i64 before indexing
+      %j64 = fir.convert %j32 : (i32) -> i64
+      %i32 = fir.convert %i : (index) -> i32
+      %i64 = fir.convert %i32 : (i32) -> i64
+      %a_idx = fir.array_coor %a(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32> // address of a(j,i)
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %b(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32> // address of b(j,i)
+      fir.store %a_val to %b_idx : !fir.ref<f32> // b(j,i) = a(j,i)
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @triangular_promotion(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK-DAG:   %[[AM:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK-DAG:   %[[BM:.*]] = fir.convert %[[B]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// Outer loop: constant bounds (folded by AffineIndexBuilder)
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// Inner loop: upper bound is affine_map<(d0) -> (d0 + 1)>(%outer_iv) — dimension
+// CHECK:         affine.for %[[J:.*]] = 1 to #{{.*}}(%[[I]]) {
+// CHECK:           affine.load %[[AM]][%{{.*}}, %{{.*}}] : memref<100x100xf32>
+// CHECK:           affine.store %{{.*}}, %[[BM]][%{{.*}}, %{{.*}}] : memref<100x100xf32>
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
diff --git a/flang/test/Transforms/simplify-do-loop.fir b/flang/test/Transforms/simplify-do-loop.fir
new file mode 100644
index 0000000000000..1cba02b834ade
--- /dev/null
+++ b/flang/test/Transforms/simplify-do-loop.fir
@@ -0,0 +1,322 @@
+// Test simplify-fir-loop pass
+// Canonicalizes fir.do_loop nests by removing shadow iter_args, forwarding
+// IV loads, and emitting final IV value stores after the outermost loop.
+
+// RUN: fir-opt --split-input-file --simplify-fir-loop -cse %s | FileCheck %s
+
+// -----
+
+// Test 1: Simple 1D loop — iter_arg removed, IV loads forwarded, final store
+// emitted after the loop.
+// Fortran: do i = 1, 100; b(i) = a(i); end do
+func.func @simple_1d(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c1 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.load %2 : !fir.ref<i32>
+    %11 = arith.addi %10, %3 overflow<nsw> : i32
+    fir.result %11 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @simple_1d(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100xf32>>)
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// CHECK:       %[[SHAPE:.*]] = fir.shape %[[C100]]
+// CHECK:       %[[IV_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i"
+// CHECK:       %[[IV_DECL:.*]] = fir.declare %[[IV_ALLOCA]]
+// iter_args must be removed:
+// CHECK:       fir.do_loop %[[I:.*]] = %[[C1]] to %[[C100]] step %[[C1]] {
+// CHECK-NOT:     iter_args
+// IV loads must be forwarded to fir.convert(IV):
+// CHECK:         %[[I_I32:.*]] = fir.convert %[[I]] : (index) -> i32
+// CHECK:         %[[I_I64:.*]] = fir.convert %[[I_I32]] : (i32) -> i64
+// CHECK:         %[[A_IDX:.*]] = fir.array_coor %[[A]](%[[SHAPE]]) %[[I_I64]]
+// CHECK:         %[[A_VAL:.*]] = fir.load %[[A_IDX]]
+// CHECK:         %[[B_IDX:.*]] = fir.array_coor %[[B]](%[[SHAPE]]) %[[I_I64]]
+// CHECK:         fir.store %[[A_VAL]] to %[[B_IDX]]
+// CHECK:       }
+// Final IV value: lb + max(0, (ub - lb + step) / step) * step
+// CHECK:       %[[SUB:.*]] = arith.subi %[[C100]], %[[C1]] : index
+// CHECK:       %[[ADD:.*]] = arith.addi %[[SUB]], %[[C1]] : index
+// CHECK:       %[[DIV:.*]] = arith.divsi %[[ADD]], %[[C1]] : index
+// CHECK:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK:       %[[CMP:.*]] = arith.cmpi sgt, %[[DIV]], %[[C0]] : index
+// CHECK:       %[[TRIP:.*]] = arith.select %[[CMP]], %[[DIV]], %[[C0]] : index
+// CHECK:       %[[MUL:.*]] = arith.muli %[[TRIP]], %[[C1]] : index
+// CHECK:       %[[FINALIDX:.*]] = arith.addi %[[C1]], %[[MUL]] : index
+// CHECK:       %[[FINAL:.*]] = fir.convert %[[FINALIDX]] : (index) -> i32
+// CHECK:       fir.store %[[FINAL]] to %[[IV_DECL]]
+// CHECK:       return
+
+// -----
+
+// Test 2: Nested 2D loop — both iter_args removed, final stores for i and j.
+// Fortran: do i = 1,100; do j = 1,100; c(j,i) = a(j,i) + b(j,i); end do; end do
+func.func @nested_2d(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+  %ad = fir.declare %a(%shp) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %bd = fir.declare %b(%shp) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %cd = fir.declare %c(%shp) {uniq_name = "_QFEc"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %i_alloca = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %i_decl = fir.declare %i_alloca {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %j_alloca = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+  %j_decl = fir.declare %j_alloca {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %init = fir.convert %c1 : (index) -> i32
+  %outer = fir.do_loop %i = %c1 to %c100 step %c1 iter_args(%i_arg = %init) -> (i32) {
+    fir.store %i_arg to %i_decl : !fir.ref<i32>
+    %inner = fir.do_loop %j = %c1 to %c100 step %c1 iter_args(%j_arg = %init) -> (i32) {
+      fir.store %j_arg to %j_decl : !fir.ref<i32>
+      %jv = fir.load %j_decl : !fir.ref<i32>
+      %j64 = fir.convert %jv : (i32) -> i64
+      %iv = fir.load %i_decl : !fir.ref<i32>
+      %i64 = fir.convert %iv : (i32) -> i64
+      %a_idx = fir.array_coor %ad(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %bd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %b_val = fir.load %b_idx : !fir.ref<f32>
+      %sum = arith.addf %a_val, %b_val fastmath<contract> : f32
+      %c_idx = fir.array_coor %cd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      fir.store %sum to %c_idx : !fir.ref<f32>
+      %jv2 = fir.load %j_decl : !fir.ref<i32>
+      %j_next = arith.addi %jv2, %init overflow<nsw> : i32
+      fir.result %j_next : i32
+    }
+    fir.store %inner to %j_decl : !fir.ref<i32>
+    %iv2 = fir.load %i_decl : !fir.ref<i32>
+    %i_next = arith.addi %iv2, %init overflow<nsw> : i32
+    fir.result %i_next : i32
+  }
+  fir.store %outer to %i_decl : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @nested_2d(
+// Both loops must have no iter_args: '{' has to directly follow the step value.
+// CHECK:       fir.do_loop %[[I:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK:         fir.do_loop %[[J:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK-NOT:       iter_args
+// IV loads forwarded — j and i accessed via fir.convert of loop IVs:
+// CHECK:           %[[J_I32:.*]] = fir.convert %[[J]] : (index) -> i32
+// CHECK:           %[[J_I64:.*]] = fir.convert %[[J_I32]] : (i32) -> i64
+// CHECK:           %[[I_I32:.*]] = fir.convert %[[I]] : (index) -> i32
+// CHECK:           %[[I_I64:.*]] = fir.convert %[[I_I32]] : (i32) -> i64
+// CHECK:         }
+// CHECK:       }
+// Final stores for both i and j after the outermost loop:
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 3: Triangular loop — inner bound depends on outer IV.
+// Fortran: do i = 1,100; do j = 1,i; c(j,i) = a(j,i) + b(j,i); end do; end do
+func.func @triangular(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+  %ad = fir.declare %a(%shp) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %bd = fir.declare %b(%shp) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %cd = fir.declare %c(%shp) {uniq_name = "_QFEc"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %i_alloca = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %i_decl = fir.declare %i_alloca {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %j_alloca = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+  %j_decl = fir.declare %j_alloca {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %init = fir.convert %c1 : (index) -> i32
+  %outer = fir.do_loop %i = %c1 to %c100 step %c1 iter_args(%i_arg = %init) -> (i32) {
+    fir.store %i_arg to %i_decl : !fir.ref<i32>
+    %i_val = fir.load %i_decl : !fir.ref<i32>
+    %i_idx = fir.convert %i_val : (i32) -> index
+    %inner = fir.do_loop %j = %c1 to %i_idx step %c1 iter_args(%j_arg = %init) -> (i32) {
+      fir.store %j_arg to %j_decl : !fir.ref<i32>
+      %jv = fir.load %j_decl : !fir.ref<i32>
+      %j64 = fir.convert %jv : (i32) -> i64
+      %iv = fir.load %i_decl : !fir.ref<i32>
+      %i64 = fir.convert %iv : (i32) -> i64
+      %a_idx = fir.array_coor %ad(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %bd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %b_val = fir.load %b_idx : !fir.ref<f32>
+      %sum = arith.addf %a_val, %b_val fastmath<contract> : f32
+      %c_idx = fir.array_coor %cd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      fir.store %sum to %c_idx : !fir.ref<f32>
+      %jv2 = fir.load %j_decl : !fir.ref<i32>
+      %j_next = arith.addi %jv2, %init overflow<nsw> : i32
+      fir.result %j_next : i32
+    }
+    fir.store %inner to %j_decl : !fir.ref<i32>
+    %iv2 = fir.load %i_decl : !fir.ref<i32>
+    %i_next = arith.addi %iv2, %init overflow<nsw> : i32
+    fir.result %i_next : i32
+  }
+  fir.store %outer to %i_decl : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @triangular(
+// Both loops transformed — no iter_args ('{' directly follows the step value):
+// CHECK:       fir.do_loop %[[I:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// Inner loop bound uses the outer IV (through fir.convert chain):
+// CHECK:         fir.do_loop %[[J:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK-NOT:       iter_args
+// CHECK:         }
+// CHECK:       }
+// Final stores for both i and j after outermost loop:
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 4: Non-unit step — do i = 1, 100, 3
+func.func @non_unit_step(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c3 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.convert %c3 : (index) -> i32
+    %11 = fir.load %2 : !fir.ref<i32>
+    %12 = arith.addi %11, %10 overflow<nsw> : i32
+    fir.result %12 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @non_unit_step(
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// Loop with step 3 and no iter_args:
+// CHECK:       fir.do_loop %[[I:.*]] = %[[C1]] to %[[C100]] step %[[C3]] {
+// CHECK-NOT:     iter_args
+// CHECK:       }
+// Final IV value: lb + trip_count * step = 1 + 34 * 3 = 103
+// CHECK:       %[[FINAL_I32:.*]] = fir.convert %{{.*}} : (index) -> i32
+// CHECK:       fir.store %[[FINAL_I32]] to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 5: Rejection — loop with fir.call must NOT be transformed.
+func.func @rejection_with_call(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c1 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    fir.call @_QPuser_sub(%7) fastmath<contract> : (!fir.ref<f32>) -> ()
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.load %2 : !fir.ref<i32>
+    %11 = arith.addi %10, %3 overflow<nsw> : i32
+    fir.result %11 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+func.func private @_QPuser_sub(!fir.ref<f32>)
+
+// CHECK-LABEL: func.func @rejection_with_call(
+// Loop must remain UNCHANGED — iter_args still present:
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (i32) {
+// CHECK:         fir.call @_QPuser_sub
+// CHECK:         fir.result %{{.*}} : i32
+// CHECK:       }
+// CHECK:       return
+
+// -----
+
+// Test 6: Rejection — iter_arg carries a reduction value (sum), not the IV.
+// The pass must NOT transform this loop because the iter_arg is not a shadow
+// of the induction variable.
+// Fortran: sum = 0.0; do i = 1, 100; sum = sum + a(i); end do
+func.func @rejection_reduction(%arg0: !fir.ref<!fir.array<100xf32>>, %sum_ref: !fir.ref<f32>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %cst = arith.constant 0.000000e+00 : f32
+  %1 = fir.do_loop %arg1 = %c1 to %c100 step %c1 iter_args(%sum = %cst) -> (f32) {
+    %2 = fir.convert %arg1 : (index) -> i64
+    %3 = fir.array_coor %arg0(%0) %2 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %4 = fir.load %3 : !fir.ref<f32>
+    %5 = arith.addf %sum, %4 fastmath<contract> : f32
+    fir.result %5 : f32
+  }
+  fir.store %1 to %sum_ref : !fir.ref<f32>
+  return
+}
+
+// CHECK-LABEL: func.func @rejection_reduction(
+// Loop must remain UNCHANGED — iter_args still present (reduction, not IV):
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (f32) {
+// CHECK:         arith.addf
+// CHECK:         fir.result %{{.*}} : f32
+// CHECK:       }
+// CHECK:       fir.store
+// CHECK:       return
+
+// -----
+
+// Test 7: Rejection — iter_arg is i32 but init value is NOT fir.convert(lb).
+// The init is an arbitrary constant, not derived from the loop lower bound.
+func.func @rejection_non_iv_init(%arg0: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %c42_i32 = arith.constant 42 : i32
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+  %2 = fir.declare %1 {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.do_loop %arg1 = %c1 to %c100 step %c1 iter_args(%x = %c42_i32) -> (i32) {
+    fir.store %x to %2 : !fir.ref<i32>
+    %4 = fir.load %2 : !fir.ref<i32>
+    %c2_i32 = arith.constant 2 : i32
+    %5 = arith.addi %4, %c2_i32 overflow<nsw> : i32
+    fir.result %5 : i32
+  }
+  fir.store %3 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @rejection_non_iv_init(
+// Loop must remain UNCHANGED — init is 42, not fir.convert(lb):
+// CHECK:       %[[C42:.*]] = arith.constant 42 : i32
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %[[C42]]) -> (i32) {
+// CHECK:         fir.result %{{.*}} : i32
+// CHECK:       }
+// CHECK:       return

>From e9b4ffb56b4ce8bb7ac4233b25aa5e154b713136 Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Wed, 15 Apr 2026 01:32:21 +0530
Subject: [PATCH 2/3] Address review comments

---
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  9 ++--
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 45 ++++++++++---------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 0b09ac5c2f37e..cd0f5a61fa2ec 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -200,15 +200,12 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
     pm.addPass(mlir::createCanonicalizerPass(config));
     pm.addPass(mlir::createCSEPass());
 
-    addNestedPassToAllTopLevelOperations<PassConstructor>(
-        pm, fir::createSimplifyDoLoopPass);
-
-    pm.addPass(mlir::createCanonicalizerPass(config));
-    pm.addPass(mlir::createCSEPass());
-
     pm.addPass(mlir::createLoopInvariantCodeMotionPass());
     pm.addPass(fir::createLoopInvariantCodeMotion());
 
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, fir::createSimplifyDoLoopPass);
+
     pm.addPass(mlir::createCanonicalizerPass(config));
     pm.addPass(mlir::createCSEPass());
 
diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
index 12f3c5a9a1f91..f48ec7277e7f6 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -104,9 +104,10 @@ static SmallVector<Value, 2> collectAliases(Value ivRef) {
   return aliases;
 }
 
-/// Collect a perfectly nested chain of fir.do_loop ops starting from `outer`.
-/// A loop is considered perfectly nested if between each nesting level only
-/// IV-related operations (stores, converts) and the inner loop exist.
+/// Collect a singly-nested chain of fir.do_loop ops starting from `outer`.
+/// Each loop body must contain exactly one inner fir.do_loop; other operations
+/// are permitted.  Safety checks (no calls, single IV store, IV doesn't escape)
+/// are enforced later by analyzeNest().
 static SmallVector<fir::DoLoopOp> collectNest(fir::DoLoopOp outer) {
   SmallVector<fir::DoLoopOp> nest;
   fir::DoLoopOp cur = outer;
@@ -259,11 +260,21 @@ static bool noCallsInNest(fir::DoLoopOp outermost) {
 
 static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
   for (auto alias : ivAliases)
-    for (auto *user : alias.getUsers())
-      if (!isa<fir::StoreOp, fir::LoadOp, fir::DeclareOp>(user)) {
+    for (auto *user : alias.getUsers()) {
+      if (auto store = dyn_cast<fir::StoreOp>(user)) {
+        if (store.getMemref() != alias) {
+          LLVM_DEBUG(llvm::dbgs()
+                     << "  [escape] IV used as stored value: " << *user
+                     << "\n");
+          return false;
+        }
+        continue;
+      }
+      if (!isa<fir::LoadOp, fir::DeclareOp>(user)) {
         LLVM_DEBUG(llvm::dbgs() << "  [escape] IV escapes: " << *user << "\n");
         return false;
       }
+    }
   return true;
 }
 
@@ -359,25 +370,20 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
       return it->second;
     return val;
   }
-  if (auto *defOp = val.getDefiningOp()) {
-    if (!outermost->isAncestor(defOp))
-      return val;
-  }
-
   auto *defOp = val.getDefiningOp();
-  if (!defOp)
+  if (!defOp || !outermost->isAncestor(defOp))
     return val;
 
   // fir.convert: rematerialize the input, then re-emit the convert.
   if (auto conv = dyn_cast<fir::ConvertOp>(*defOp)) {
-    auto newInput = rematerializeOutside(conv.getValue(), outermost, builder,
+    Value newInput = rematerializeOutside(conv.getValue(), outermost, builder,
                                          loc, ivFinalMap);
     return fir::ConvertOp::create(builder, loc, conv.getType(), newInput);
   }
 
   // fir.load: the address must already be outside (alloca/declare/etc).
   if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
-    auto addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
+    Value addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
                                      ivFinalMap);
     return fir::LoadOp::create(builder, loc, addr);
   }
@@ -402,7 +408,6 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
     return cloned->getResult(0);
   }
 
-  // For anything else, assume it's already available.
   return val;
 }
 
@@ -426,8 +431,8 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
 /// the Fortran final value (which is one step past the last iteration).
 /// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
 /// with i=100 (last iteration), not i=101 (Fortran final).
-static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
-                             fir::DoLoopOp outermost,
+static void emitFinalIVStore(OpBuilder &builder, Location loc,
+                             LoopIVInfo &info, fir::DoLoopOp outermost,
                              DenseMap<Value, Value> &ivFinalMap) {
   // Rematerialize bounds outside the outermost loop if needed.
   // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV
@@ -577,7 +582,7 @@ static fir::DoLoopOp transformOneLoop(fir::DoLoopOp loop,
 class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
 public:
   void runOnOperation() override {
-    auto func = getOperation();
+    mlir::func::FuncOp func = getOperation();
 
     // Collect all outermost fir.do_loop ops.
     SmallVector<fir::DoLoopOp> outerLoops;
@@ -586,8 +591,8 @@ class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
         outerLoops.push_back(loop);
     });
 
-    for (auto outerLoop : outerLoops) {
-      auto nestLoops = collectNest(outerLoop);
+    for (fir::DoLoopOp outerLoop : outerLoops) {
+      SmallVector<fir::DoLoopOp> nestLoops = collectNest(outerLoop);
       LLVM_DEBUG(llvm::dbgs()
                  << "SimplifyDoLoop: nest depth " << nestLoops.size() << " at "
                  << outerLoop.getLoc() << "\n");
@@ -599,7 +604,7 @@ class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
 
       // ======== Analysis Phase ========
       SmallVector<LoopIVInfo> infos;
-      for (auto loop : nestLoops)
+      for (fir::DoLoopOp loop : nestLoops)
         infos.push_back({loop, {}, {}, {}, {}, {}, {}});
 
       if (!analyzeNest(infos)) {

>From df728d32a92a695cbb124dd11a2434dc9365165f Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Wed, 15 Apr 2026 17:20:10 +0530
Subject: [PATCH 3/3] Address review comments: add canSafelyRematerialize() to
 reject loop nests whose bounds depend on non-IV loads modified inside the
 loop.

---
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 99 +++++++++++++++----
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
index f48ec7277e7f6..6781d4084035e 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -34,7 +34,7 @@
 //      a. Remove the initial store (fir.store %iter_arg to %iv_alloca)
 //      b. Forward all loads of IV alloca inside loop body to fir.convert(IV)
 //      todo: the forwarding of load of iv alloca can be done by some other pass
-//      like fir-memref-dataflow-opt pass (if it is available). 
+//      like fir-memref-dataflow-opt pass (if it is available).
 //      c. Strip iter_args and fir.result, rebuild as simple fir.do_loop
 //   2. After the outermost loop, compute and store final IV values
 //      for all loops whose IV is live after the loop (outer to inner order).
@@ -50,6 +50,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
@@ -263,9 +264,8 @@ static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
     for (auto *user : alias.getUsers()) {
       if (auto store = dyn_cast<fir::StoreOp>(user)) {
         if (store.getMemref() != alias) {
-          LLVM_DEBUG(llvm::dbgs()
-                     << "  [escape] IV used as stored value: " << *user
-                     << "\n");
+          LLVM_DEBUG(llvm::dbgs() << "  [escape] IV used as stored value: "
+                                  << *user << "\n");
           return false;
         }
         continue;
@@ -278,6 +278,57 @@ static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
   return true;
 }
 
+// ---- Check if a bound value can be safely rematerialized after the loop ---
+// Runs during analysis (pre-transformation) to reject nests whose bounds
+// contain ops that cannot be correctly duplicated after the outermost loop.
+//
+// Safe:  values defined outside the outermost loop, loop IVs (block args of
+//        fir.do_loop — resolved via ivFinalMap), fir.convert, arith constants,
+//        and single-result pure ops whose operands are all safe.  Loads of IV
+//        allocas are safe because transformOneLoop will forward them to
+//        fir.convert(IV) before rematerializeOutside runs.
+// Unsafe: fir.load of a non-IV address inside the loop — the memory may have
+//         been modified between the original load and the post-loop insertion
+//         point, so duplicating the load would read a wrong value.
+
+static bool canSafelyRematerialize(Value val, fir::DoLoopOp outermost,
+                                   ArrayRef<LoopIVInfo> infos) {
+  // Block args: safe if owned outside the nest or by a fir.do_loop.
+  if (auto arg = dyn_cast<BlockArgument>(val)) {
+    auto *parent = arg.getOwner()->getParentOp();
+    return !outermost->isAncestor(parent) || isa<fir::DoLoopOp>(parent);
+  }
+
+  // Anything defined outside the nest is already available after the loop.
+  auto *def = val.getDefiningOp();
+  if (!def || !outermost->isAncestor(def))
+    return true;
+
+  if (isa<arith::ConstantOp>(*def))
+    return true;
+
+  if (auto convert = dyn_cast<fir::ConvertOp>(*def))
+    return canSafelyRematerialize(convert.getValue(), outermost, infos);
+
+  // Only loads from one of the nest's IV aliases are accepted.
+  if (auto load = dyn_cast<fir::LoadOp>(*def)) {
+    bool readsIV = llvm::any_of(infos, [&](const LoopIVInfo &info) {
+      return llvm::is_contained(info.ivAliases, load.getMemref());
+    });
+    if (!readsIV)
+      LLVM_DEBUG(llvm::dbgs() << "  [remat] non-IV load in bound: " << *def
+                              << "\n");
+    return readsIV;
+  }
+
+  // Single-result pure ops are safe iff every operand is; reject the rest.
+  if (def->getNumResults() != 1 || !mlir::isPure(def))
+    return false;
+  return llvm::all_of(def->getOperands(), [&](Value operand) {
+    return canSafelyRematerialize(operand, outermost, infos);
+  });
+}
+
 // ---- Full nest analysis ---------------------------------------------------
 
 static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
@@ -343,6 +394,19 @@ static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
     }
   }
 
+  // --- Verify that loop bounds can be safely rematerialized after the loop ---
+  fir::DoLoopOp outermost = infos.front().loop;
+  for (auto &info : infos) {
+    if (!canSafelyRematerialize(info.lowerBound, outermost, infos) ||
+        !canSafelyRematerialize(info.upperBound, outermost, infos) ||
+        !canSafelyRematerialize(info.step, outermost, infos)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  bounds not safely rematerializable at "
+                 << info.loop.getLoc() << "\n");
+      return false;
+    }
+  }
+
   return true;
 }
 
@@ -353,7 +417,12 @@ static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
 /// Ensure a value is available (dominates) at the current insertion point.
 /// If the value is already defined outside `outermost`, return it directly.
 /// Otherwise, rematerialize the computation by cloning through simple ops
-/// (fir.convert, fir.load, arith constants).
+/// (fir.convert, arith constants, other single-result pure ops).
+///
+/// Precondition: canSafelyRematerialize() has already verified that the
+/// bound values do not depend on non-IV loads inside the loop.  Any IV loads
+/// (fir.load of IV alloca) have been forwarded to fir.convert(IV) by
+/// transformOneLoop before this function is called.
 ///
 /// `ivFinalMap` maps loop induction variables (block arguments) to their
 /// already-computed final index values.  This allows inner loop bounds that
@@ -377,27 +446,19 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
   // fir.convert: rematerialize the input, then re-emit the convert.
   if (auto conv = dyn_cast<fir::ConvertOp>(*defOp)) {
     Value newInput = rematerializeOutside(conv.getValue(), outermost, builder,
-                                         loc, ivFinalMap);
+                                          loc, ivFinalMap);
     return fir::ConvertOp::create(builder, loc, conv.getType(), newInput);
   }
 
-  // fir.load: the address must already be outside (alloca/declare/etc).
-  if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
-    Value addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
-                                     ivFinalMap);
-    return fir::LoadOp::create(builder, loc, addr);
-  }
-
   // arith.constant: just clone it.
   if (isa<arith::ConstantOp>(*defOp)) {
     auto *cloned = builder.clone(*defOp);
     return cloned->getResult(0);
   }
 
-  // Arithmetic ops (addi, subi, muli, divsi, cmpi, select): rematerialize
-  // all operands recursively, then clone the op with new operands.
-  if (isa<arith::AddIOp, arith::SubIOp, arith::MulIOp, arith::DivSIOp,
-          arith::CmpIOp, arith::SelectOp>(*defOp)) {
+  // Pure ops (no side effects): rematerialize all operands recursively,
+  // then clone the op with new operands.
+  if (defOp->getNumResults() == 1 && mlir::isPure(defOp)) {
     SmallVector<Value> newOperands;
     for (auto operand : defOp->getOperands())
       newOperands.push_back(
@@ -431,8 +492,8 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
 /// the Fortran final value (which is one step past the last iteration).
 /// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
 /// with i=100 (last iteration), not i=101 (Fortran final).
-static void emitFinalIVStore(OpBuilder &builder, Location loc,
-                             LoopIVInfo &info, fir::DoLoopOp outermost,
+static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
+                             fir::DoLoopOp outermost,
                              DenseMap<Value, Value> &ivFinalMap) {
   // Rematerialize bounds outside the outermost loop if needed.
   // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV