[flang-commits] [flang] [Flang] Add opt-in affine loop optimization pipeline (PR #191854)

Sun Apr 26 01:43:24 PDT 2026

https://github.com/shuyadav-dev updated https://github.com/llvm/llvm-project/pull/191854

>From cd59f79776a6b57d04a17191f5511a365e938417 Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Mon, 13 Apr 2026 22:11:42 +0530
Subject: [PATCH 1/4] [Flang] Add opt-in affine loop optimization pipeline. Add
 a new SimplifyDoLoop canonicalization pass and enhance the existing
 AffinePromotion and AffineDemotion passes to enable MLIR affine loop
 transformations (tiling, fusion, interchange) on Fortran DO loops. The
 pipeline is gated behind --enable-affine-loop-opt.

---
 .../flang/Optimizer/Passes/CommandLineOpts.h  |   2 +
 .../flang/Optimizer/Transforms/Passes.h       |   1 +
 .../flang/Optimizer/Transforms/Passes.td      |  26 +
 .../lib/Optimizer/Passes/CommandLineOpts.cpp  |   6 +
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  50 ++
 .../Optimizer/Transforms/AffineDemotion.cpp   | 237 ++++++-
 .../Optimizer/Transforms/AffinePromotion.cpp  | 359 ++++++----
 flang/lib/Optimizer/Transforms/CMakeLists.txt |   1 +
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 639 ++++++++++++++++++
 flang/test/Fir/affine-demotion.fir            | 174 ++++-
 flang/test/Fir/affine-promotion.fir           | 151 ++++-
 flang/test/Transforms/simplify-do-loop.fir    | 322 +++++++++
 12 files changed, 1757 insertions(+), 211 deletions(-)
 create mode 100644 flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
 create mode 100644 flang/test/Transforms/simplify-do-loop.fir

diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
index 882f02032a3b8..4b48cc2abe165 100644
--- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
+++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
@@ -56,6 +56,8 @@ extern llvm::cl::opt<bool> disableFirAliasTags;
 extern llvm::cl::opt<bool> disableFirAvc;
 extern llvm::cl::opt<bool> disableFirMao;
 extern llvm::cl::opt<bool> enableFirLICM;
+extern llvm::cl::opt<bool> enableAffineLoopOpt;
+extern llvm::cl::opt<unsigned> affineLoopOptTileSize;
 extern llvm::cl::opt<bool> useOldAliasTags;
 
 /// CodeGen Passes
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index adacd3cc0cf51..6e2170a3a23a2 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -45,6 +45,7 @@ enum class LICMNestedHoistingMode {
 #include "flang/Optimizer/Transforms/Passes.h.inc"
 
 std::unique_ptr<mlir::Pass> createAffineDemotionPass();
+std::unique_ptr<mlir::Pass> createSimplifyDoLoopPass();
 std::unique_ptr<mlir::Pass>
 createArrayValueCopyPass(fir::ArrayValueCopyOptions options = {});
 std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index e107672adf907..2242cd2ff5595 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -35,6 +35,32 @@ def AbstractResultOpt
   ];
 }
 
+def SimplifyDoLoop : Pass<"simplify-fir-loop", "::mlir::func::FuncOp"> {
+  let summary = "Canonicalize fir.do_loop nests for affine promotion.";
+  let description = [{
+    General-purpose FIR loop canonicalization pass.  Transforms perfectly nested
+    fir.do_loop nests into a canonical form suitable for affine promotion and
+    loop optimizations (tiling, fusion, interchange, etc.).
+
+    Analysis phase (per nest):
+    1. Verifies each iter_arg is a shadow of the loop induction variable.
+    2. Builds an IV map: { loop -> (ivAlloca, lb, ub, step, ivType) }.
+    3. Checks safety: single IV store, no calls, IV doesn't escape.
+
+    Transformation phase:
+    1. Removes iter_args, init/final stores, and IV increment ops.
+    2. Forwards loads of the IV alloca to fir.convert(induction_var).
+    3. Emits final IV value stores after the outermost loop using the
+       Fortran formula: final = lb + ((ub - lb + step) / step) * step.
+  }];
+  let constructor = "::fir::createSimplifyDoLoopPass()";
+  let dependentDialects = [
+    "fir::FIROpsDialect", "mlir::func::FuncDialect",
+    "mlir::arith::ArithDialect"
+  ];
+}
+
+
 def AffineDialectPromotion : Pass<"promote-to-affine", "::mlir::func::FuncOp"> {
   let summary = "Promotes `fir.{do_loop,if}` to `affine.{for,if}`.";
   let description = [{
diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
index d461c1b9757b5..d137eb82054dd 100644
--- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
+++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
@@ -62,6 +62,12 @@ cl::opt<bool> useOldAliasTags(
              "the FIR alias tags pass"),
     cl::init(false), cl::Hidden);
 EnableOption(FirLICM, "fir-licm", "FIR loop invariant code motion");
+EnableOption(AffineLoopOpt, "affine-loop-opt",
+             "affine loop optimizations (tiling, fusion, interchange)");
+cl::opt<unsigned> affineLoopOptTileSize(
+    "affine-loop-opt-tile-size",
+    cl::desc("tile size for affine loop tiling (0 = auto from cache model)"),
+    cl::init(0), cl::Hidden);
 
 /// CodeGen Passes
 DisableOption(CodeGenRewrite, "codegen-rewrite", "rewrite FIR for codegen");
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index bc95fcba512a4..34209fb89335f 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -12,7 +12,9 @@
 #include "flang/Optimizer/Passes/Pipelines.h"
 #include "flang/Optimizer/OpenACC/Passes.h"
 #include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Affine/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
+#include "mlir/Pass/PassRegistry.h"
 #include "llvm/Support/CommandLine.h"
 
 /// Force setting the no-alias attribute on fuction arguments when possible.
@@ -202,6 +204,54 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
   config.setRegionSimplificationLevel(
       mlir::GreedySimplifyRegionLevel::Disabled);
   pm.addPass(mlir::createCSEPass());
+
+  // Affine loop optimization pipeline (opt-in via --enable-affine-loop-opt).
+  if (enableAffineLoopOpt) {
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, fir::createSimplifyDoLoopPass);
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(mlir::createLoopInvariantCodeMotionPass());
+    pm.addPass(fir::createLoopInvariantCodeMotion());
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(fir::createPromoteToAffinePass());
+
+    // Use remove-dead-values instead of canonicalize between promotion and
+    // demotion to avoid folding fir.convert chains.  Canonicalize can merge
+    // a linearisation convert (ref<NxM> -> ref<N*M>) with the promotion
+    // convert (ref<N*M> -> memref<N*M>) into a single ref<NxM> -> memref<N*M>,
+    // which would cause a rank mismatch in AffineDemotion.
+    pm.addPass(mlir::createRemoveDeadValuesPass());
+    pm.addPass(mlir::createCSEPass());
+
+    if (affineLoopOptTileSize > 0) {
+      mlir::affine::registerAffineLoopTiling();
+      std::string pipeline = "func.func(affine-loop-tile{tile-size=" +
+                             std::to_string(affineLoopOptTileSize) + "})";
+      (void)mlir::parsePassPipeline(pipeline, pm);
+    } else {
+      pm.addNestedPass<mlir::func::FuncOp>(
+          mlir::affine::createLoopTilingPass());
+    }
+
+    pm.addPass(mlir::createRemoveDeadValuesPass());
+    pm.addPass(mlir::createCSEPass());
+
+    pm.addPass(fir::createAffineDemotionPass());
+    pm.addPass(mlir::createLowerAffinePass());
+
+    pm.addPass(mlir::createCanonicalizerPass(config));
+    pm.addPass(mlir::createCSEPass());
+  }
+
   fir::addAVC(pm, pc.OptLevel);
   addNestedPassToAllTopLevelOperations<PassConstructor>(
       pm, fir::createCharacterConversion);
diff --git a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
index 430ef62a3a55d..4b5f025f4bf36 100644
--- a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
@@ -22,6 +22,7 @@
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -33,6 +34,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include <algorithm>
 
 namespace fir {
 #define GEN_PASS_DEF_AFFINEDIALECTDEMOTION
@@ -46,6 +48,82 @@ using namespace mlir;
 
 namespace {
 
+/// Check whether the FIR base reference points to an array with
+/// dynamic (runtime-determined) extents, e.g. `!fir.ref<!fir.array<?x?xf32>>`.
+/// `fir.coordinate_of` cannot handle such arrays because it needs
+/// compile-time-known dimensions to linearise the multi-dimensional index.
+static bool baseHasDynamicExtents(mlir::Value base) {
+  mlir::Type ty = base.getType();
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+    ty = refTy.getEleTy();
+  else if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    ty = heapTy.getEleTy();
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
+    return seqTy.hasDynamicExtents();
+  return false;
+}
+
+/// Convert 0-based memref indices (already reversed to column-major order)
+/// to Fortran indices expected by fir.array_coor.
+///
+/// For fir.shape (implicit lb=1):    Fortran_idx = 0based + 1
+/// For fir.shape_shift (explicit lb): Fortran_idx = 0based + lb_k
+static SmallVector<Value>
+toFortranIndices(mlir::Value shape, ArrayRef<Value> zeroBasedIndices,
+                 mlir::Location loc, ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> result;
+
+  if (auto shapeShiftOp = shape.getDefiningOp<fir::ShapeShiftOp>()) {
+    auto pairs = shapeShiftOp.getPairs();
+    for (unsigned k = 0; k < zeroBasedIndices.size(); ++k) {
+      mlir::Value lb = pairs[k * 2]; // lower bound for dimension k
+      result.push_back(
+          arith::AddIOp::create(rewriter, loc, zeroBasedIndices[k], lb));
+    }
+  } else {
+    // fir.shape or anything else — lower bound is 1
+    auto one = arith::ConstantIndexOp::create(rewriter, loc, 1);
+    for (auto idx : zeroBasedIndices)
+      result.push_back(arith::AddIOp::create(rewriter, loc, idx, one));
+  }
+  return result;
+}
+
+/// Walk backwards from `base` to locate the `fir.shape` (or shapeshift)
+/// that carries the runtime dimension sizes.
+///
+/// Handles three cases:
+///   1. Explicit-shape arrays: base is from fir.declare → shape is attached.
+///   2. Local allocatable arrays: base is from fir.allocmem → find the
+///      fir.embox that wraps it and recover the shape from there.
+///   3. Allocatable dummy / module arrays: base is from fir.box_addr →
+///      use the original fir.box directly with fir.array_coor (the box
+///      carries all shape info).  In this case `outBoxBase` is set to the
+///      box value and the returned shape is null.
+static mlir::Value findShapeForBase(mlir::Value base, mlir::Value &outBoxBase) {
+  outBoxBase = mlir::Value{};
+
+  // Case 1: explicit-shape via fir.declare
+  if (auto declareOp = base.getDefiningOp<fir::DeclareOp>())
+    return declareOp.getShape();
+
+  // Case 2: local allocatable — find fir.embox that wraps this heap pointer
+  if (base.getDefiningOp<fir::AllocMemOp>()) {
+    for (auto *user : base.getUsers()) {
+      if (auto embox = mlir::dyn_cast<fir::EmboxOp>(user))
+        return embox.getShape();
+    }
+  }
+
+  // Case 3: allocatable dummy arg / module array — base is from fir.box_addr
+  if (auto boxAddr = base.getDefiningOp<fir::BoxAddrOp>()) {
+    outBoxBase = boxAddr.getVal();
+    return mlir::Value{};
+  }
+
+  return mlir::Value{};
+}
+
 class AffineLoadConversion
     : public OpConversionPattern<mlir::affine::AffineLoadOp> {
 public:
@@ -60,12 +138,69 @@ class AffineLoadConversion
     if (!maybeExpandedMap)
       return failure();
 
-    auto coorOp = fir::CoordinateOp::create(
-        rewriter, op.getLoc(),
-        fir::ReferenceType::get(op.getResult().getType()), adaptor.getMemref(),
-        *maybeExpandedMap);
+    auto expandedIndices = *maybeExpandedMap;
 
-    rewriter.replaceOpWithNewOp<fir::LoadOp>(op, coorOp.getResult());
+    // AffinePromotion reverses dimension order (column-major FIR → row-major
+    // memref) and index order.  Reverse indices back for fir.coordinate_of
+    // which uses Fortran's column-major layout.
+    // ConvertConversion already strips the single fir.convert (FIR -> memref)
+    // that AffinePromotion created, so `base` is the original FIR value.
+    // Do NOT trace through any remaining fir.convert — those belong to the
+    // source IR (e.g. linearisation converts from -O2 whole-array lowering).
+    Value base = adaptor.getMemref();
+
+    auto hasSequenceType = [](mlir::Type ty) -> bool {
+      if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+        return mlir::isa<fir::SequenceType>(refTy.getEleTy());
+      if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+        return mlir::isa<fir::SequenceType>(heapTy.getEleTy());
+      return false;
+    };
+
+    if (!hasSequenceType(base.getType()))
+      return op.emitError(
+          "unsupported memref base: expected !fir.ref<!fir.array<...>> or "
+          "!fir.heap<!fir.array<...>>; fir.box and plain memref bases "
+          "are not yet handled by AffineDemotion");
+
+    std::reverse(expandedIndices.begin(), expandedIndices.end());
+
+    auto resultRefTy = fir::ReferenceType::get(op.getResult().getType());
+
+    if (baseHasDynamicExtents(base)) {
+      mlir::Value boxBase;
+      mlir::Value shape = findShapeForBase(base, boxBase);
+
+      if (shape) {
+        auto fortranIndices =
+            toFortranIndices(shape, expandedIndices, op.getLoc(), rewriter);
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, base, shape,
+            /*slice=*/mlir::Value{}, fortranIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::LoadOp>(op, arrayCoorOp.getResult());
+      } else if (boxBase) {
+        // Case 3: box carries shape — use box directly; lb=1 assumed
+        auto one = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1);
+        SmallVector<Value> oneBasedIndices;
+        for (auto idx : expandedIndices)
+          oneBasedIndices.push_back(
+              arith::AddIOp::create(rewriter, op.getLoc(), idx, one));
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, boxBase,
+            /*shape=*/mlir::Value{},
+            /*slice=*/mlir::Value{}, oneBasedIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::LoadOp>(op, arrayCoorOp.getResult());
+      } else {
+        return op.emitError(
+            "cannot find shape or box for dynamic-extent array");
+      }
+    } else {
+      auto coorOp = fir::CoordinateOp::create(
+          rewriter, op.getLoc(), resultRefTy, base, expandedIndices);
+      rewriter.replaceOpWithNewOp<fir::LoadOp>(op, coorOp.getResult());
+    }
     return success();
   }
 };
@@ -84,12 +219,64 @@ class AffineStoreConversion
     if (!maybeExpandedMap)
       return failure();
 
-    auto coorOp = fir::CoordinateOp::create(
-        rewriter, op.getLoc(),
-        fir::ReferenceType::get(op.getValueToStore().getType()),
-        adaptor.getMemref(), *maybeExpandedMap);
-    rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
-                                              coorOp.getResult());
+    auto expandedIndices = *maybeExpandedMap;
+
+    Value base = adaptor.getMemref();
+
+    auto hasSequenceType = [](mlir::Type ty) -> bool {
+      if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(ty))
+        return mlir::isa<fir::SequenceType>(refTy.getEleTy());
+      if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+        return mlir::isa<fir::SequenceType>(heapTy.getEleTy());
+      return false;
+    };
+
+    if (!hasSequenceType(base.getType()))
+      return op.emitError(
+          "unsupported memref base: expected !fir.ref<!fir.array<...>> or "
+          "!fir.heap<!fir.array<...>>; fir.box and plain memref bases "
+          "are not yet handled by AffineDemotion");
+
+    std::reverse(expandedIndices.begin(), expandedIndices.end());
+
+    auto resultRefTy = fir::ReferenceType::get(op.getValueToStore().getType());
+
+    if (baseHasDynamicExtents(base)) {
+      mlir::Value boxBase;
+      mlir::Value shape = findShapeForBase(base, boxBase);
+
+      if (shape) {
+        auto fortranIndices =
+            toFortranIndices(shape, expandedIndices, op.getLoc(), rewriter);
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, base, shape,
+            /*slice=*/mlir::Value{}, fortranIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                  arrayCoorOp.getResult());
+      } else if (boxBase) {
+        auto one = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1);
+        SmallVector<Value> oneBasedIndices;
+        for (auto idx : expandedIndices)
+          oneBasedIndices.push_back(
+              arith::AddIOp::create(rewriter, op.getLoc(), idx, one));
+        auto arrayCoorOp = fir::ArrayCoorOp::create(
+            rewriter, op.getLoc(), resultRefTy, boxBase,
+            /*shape=*/mlir::Value{},
+            /*slice=*/mlir::Value{}, oneBasedIndices,
+            /*typeparams=*/mlir::ValueRange{});
+        rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                  arrayCoorOp.getResult());
+      } else {
+        return op.emitError(
+            "cannot find shape or box for dynamic-extent array");
+      }
+    } else {
+      auto coorOp = fir::CoordinateOp::create(
+          rewriter, op.getLoc(), resultRefTy, base, expandedIndices);
+      rewriter.replaceOpWithNewOp<fir::StoreOp>(op, adaptor.getValue(),
+                                                coorOp.getResult());
+    }
     return success();
   }
 };
@@ -101,22 +288,18 @@ class ConvertConversion : public mlir::OpRewritePattern<fir::ConvertOp> {
   matchAndRewrite(fir::ConvertOp op,
                   mlir::PatternRewriter &rewriter) const override {
     if (mlir::isa<mlir::MemRefType>(op.getRes().getType())) {
-      // due to index calculation moving to affine maps we still need to
-      // add converts for sequence types this has a side effect of losing
-      // some information about arrays with known dimensions by creating:
-      // fir.convert %arg0 : (!fir.ref<!fir.array<5xi32>>) ->
-      // !fir.ref<!fir.array<?xi32>>
-      if (auto refTy =
-              mlir::dyn_cast<fir::ReferenceType>(op.getValue().getType()))
-        if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy())) {
-          fir::SequenceType::Shape flatShape = {
-              fir::SequenceType::getUnknownExtent()};
-          auto flatArrTy = fir::SequenceType::get(flatShape, arrTy.getEleTy());
-          auto flatTy = fir::ReferenceType::get(flatArrTy);
-          rewriter.replaceOpWithNewOp<fir::ConvertOp>(op, flatTy,
-                                                      op.getValue());
-          return success();
-        }
+      mlir::Type srcTy = op.getValue().getType();
+      auto getSeqTy = [](mlir::Type t) -> fir::SequenceType {
+        if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(t))
+          return mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy());
+        if (auto heapTy = mlir::dyn_cast<fir::HeapType>(t))
+          return mlir::dyn_cast<fir::SequenceType>(heapTy.getEleTy());
+        return {};
+      };
+      if (getSeqTy(srcTy)) {
+        rewriter.replaceOp(op, op.getValue());
+        return success();
+      }
       rewriter.replaceOp(op, op.getValue());
     }
     return success();
diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
index bdc34186a713b..da9364e62682d 100644
--- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
@@ -63,22 +63,117 @@ struct AffineFunctionAnalysis {
 };
 } // namespace
 
-static bool analyzeCoordinate(mlir::Value coordinate, mlir::Operation *op) {
-  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(coordinate)) {
-    if (isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()))
-      return true;
-    LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: array coordinate is not a "
-                               "loop induction variable (owner not loopOp)\n";
-               op->dump());
+/// Recursively checks whether a value can be expressed as an affine function
+/// of loop induction variables and integer constants.  Walks through
+/// fir.convert (type-cast), arith.addi, arith.subi, and arith.muli (the
+/// latter only when at least one operand is a compile-time constant so the
+/// result stays within MLIR's strict affine expression rules).
+static bool isAffineIndex(mlir::Value val, unsigned depth = 0) {
+  if (depth > 16)
+    return false;
+
+  if (auto conv = val.getDefiningOp<fir::ConvertOp>())
+    return isAffineIndex(conv.getValue(), depth + 1);
+
+  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val))
+    return isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
+           isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp());
+
+  auto *defOp = val.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  if (isa<mlir::arith::ConstantOp>(defOp))
+    return true;
+
+  if (auto add = dyn_cast<mlir::arith::AddIOp>(defOp))
+    return isAffineIndex(add.getLhs(), depth + 1) &&
+           isAffineIndex(add.getRhs(), depth + 1);
+
+  if (auto sub = dyn_cast<mlir::arith::SubIOp>(defOp))
+    return isAffineIndex(sub.getLhs(), depth + 1) &&
+           isAffineIndex(sub.getRhs(), depth + 1);
+
+  if (auto mul = dyn_cast<mlir::arith::MulIOp>(defOp)) {
+    auto *lhsDef = mul.getLhs().getDefiningOp();
+    auto *rhsDef = mul.getRhs().getDefiningOp();
+    if ((lhsDef && isa<mlir::arith::ConstantOp>(lhsDef)) ||
+        (rhsDef && isa<mlir::arith::ConstantOp>(rhsDef)))
+      return isAffineIndex(mul.getLhs(), depth + 1) &&
+             isAffineIndex(mul.getRhs(), depth + 1);
     return false;
   }
-  LLVM_DEBUG(
-      llvm::dbgs() << "AffineLoopAnalysis: array coordinate is not a loop "
-                      "induction variable (not a block argument)\n";
-      op->dump(); coordinate.getDefiningOp()->dump());
+
+  LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: index is not an affine "
+                             "expression of loop IVs\n";
+             defOp->dump());
   return false;
 }
 
+/// Builds an mlir::AffineExpr by recursively walking the FIR/arith expression
+/// tree rooted at a fir.array_coor index value.  Loop induction variables
+/// become affine dimensions; integer constants are folded into the expression.
+struct AffineIndexBuilder {
+  using MaybeExpr = std::optional<mlir::AffineExpr>;
+
+  explicit AffineIndexBuilder(mlir::MLIRContext *ctx) : context(ctx) {}
+
+  MaybeExpr build(mlir::Value val) {
+    if (auto conv = val.getDefiningOp<fir::ConvertOp>())
+      return build(conv.getValue());
+
+    if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val)) {
+      if (isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
+          isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp())) {
+        for (unsigned i = 0; i < dims.size(); ++i)
+          if (dims[i] == val)
+            return mlir::getAffineDimExpr(i, context);
+        unsigned idx = dims.size();
+        dims.push_back(val);
+        return mlir::getAffineDimExpr(idx, context);
+      }
+      return {};
+    }
+
+    auto *defOp = val.getDefiningOp();
+    if (!defOp)
+      return {};
+
+    if (auto op = dyn_cast<mlir::arith::ConstantOp>(defOp))
+      if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(op.getValue()))
+        return mlir::getAffineConstantExpr(intAttr.getInt(), context);
+
+    if (auto op = dyn_cast<mlir::arith::AddIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs + *rhs;
+      return {};
+    }
+
+    if (auto op = dyn_cast<mlir::arith::SubIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs - *rhs;
+      return {};
+    }
+
+    if (auto op = dyn_cast<mlir::arith::MulIOp>(defOp)) {
+      auto lhs = build(op.getLhs());
+      auto rhs = build(op.getRhs());
+      if (lhs && rhs)
+        return *lhs * *rhs;
+      return {};
+    }
+
+    return {};
+  }
+
+  mlir::MLIRContext *context;
+  llvm::SmallVector<mlir::Value> dims;
+};
+
 namespace {
 struct AffineLoopAnalysis {
   AffineLoopAnalysis() = default;
@@ -134,7 +229,7 @@ struct AffineLoopAnalysis {
       }
       bool canPromote = true;
       for (auto coordinate : acoOp.getIndices())
-        canPromote = canPromote && analyzeCoordinate(coordinate, op);
+        canPromote = canPromote && isAffineIndex(coordinate);
       return canPromote;
     }
     if (auto coOp = memref.getDefiningOp<CoordinateOp>()) {
@@ -322,27 +417,6 @@ AffineFunctionAnalysis::getChildIfAnalysis(fir::IfOp op) const {
   return it->getSecond();
 }
 
-/// AffineMap rewriting fir.array_coor operation to affine apply,
-/// %dim = fir.gendim %lowerBound, %upperBound, %stride
-/// %a = fir.array_coor %arr(%dim) %i
-/// returning affineMap = affine_map<(i)[lb, ub, st] -> (i*st - lb)>
-static mlir::AffineMap createArrayIndexAffineMap(unsigned dimensions,
-                                                 MLIRContext *context) {
-  auto index = mlir::getAffineConstantExpr(0, context);
-  auto accuExtent = mlir::getAffineConstantExpr(1, context);
-  for (unsigned i = 0; i < dimensions; ++i) {
-    mlir::AffineExpr idx = mlir::getAffineDimExpr(i, context),
-                     lowerBound = mlir::getAffineSymbolExpr(i * 3, context),
-                     currentExtent =
-                         mlir::getAffineSymbolExpr(i * 3 + 1, context),
-                     stride = mlir::getAffineSymbolExpr(i * 3 + 2, context),
-                     currentPart = (idx * stride - lowerBound) * accuExtent;
-    index = currentPart + index;
-    accuExtent = accuExtent * currentExtent;
-  }
-  return mlir::AffineMap::get(dimensions, dimensions * 3, index);
-}
-
 static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
   if (auto definition = value.getDefiningOp<mlir::arith::ConstantOp>())
     if (auto stepAttr = mlir::dyn_cast<IntegerAttr>(definition.getValue()))
@@ -350,104 +424,111 @@ static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
   return {};
 }
 
-static mlir::Type coordinateArrayElement(fir::ArrayCoorOp op) {
-  if (auto refType =
-          mlir::dyn_cast_or_null<ReferenceType>(op.getMemref().getType())) {
-    if (auto seqType =
-            mlir::dyn_cast_or_null<SequenceType>(refType.getEleTy())) {
-      return seqType.getEleTy();
-    }
-  }
-  op.emitError(
-      "AffineLoopConversion: array type in coordinate operation not valid\n");
-  return mlir::Type();
-}
-
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeOp shape,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(),
-                                             rewriter.getIndexType(),
-                                             rewriter.getIndexAttr(1));
-  auto extents = shape.getExtents();
-  for (auto i = extents.begin(); i < extents.end(); i++) {
-    indexArgs.push_back(one);
-    indexArgs.push_back(*i);
-    indexArgs.push_back(one);
-  }
-}
+/// Holds the result of creating multi-dimensional affine operations.
+struct MultiDimAffineResult {
+  SmallVector<mlir::Value> indices;
+  fir::ConvertOp arrayConvert;
+};
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeShiftOp shape,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(),
-                                             rewriter.getIndexType(),
-                                             rewriter.getIndexAttr(1));
-  auto extents = shape.getPairs();
-  for (auto i = extents.begin(); i < extents.end();) {
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(one);
-  }
-}
+/// Creates multi-dimensional affine operations preserving array dimensionality.
+/// Instead of linearizing all indices into a single 1D offset, this extracts
+/// the array shape from the FIR SequenceType, creates a matching multi-dim
+/// MemRefType, and adjusts each per-dimension index from Fortran 1-based to
+/// memref 0-based indexing.
+static MultiDimAffineResult
+createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
+  auto acoOp = arrayRef.getDefiningOp<ArrayCoorOp>();
+  auto loc = acoOp.getLoc();
+  auto *context = acoOp.getContext();
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::SliceOp slice,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  auto extents = slice.getTriples();
-  for (auto i = extents.begin(); i < extents.end();) {
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
-    indexArgs.push_back(*i++);
+  fir::SequenceType seqType;
+  if (auto refType =
+          mlir::dyn_cast<fir::ReferenceType>(acoOp.getMemref().getType()))
+    seqType = mlir::dyn_cast<fir::SequenceType>(refType.getEleTy());
+  else if (auto heapType =
+               mlir::dyn_cast<fir::HeapType>(acoOp.getMemref().getType()))
+    seqType = mlir::dyn_cast<fir::SequenceType>(heapType.getEleTy());
+
+  // Reverse the extents because memref is row-major whereas fir.array is
+  // column-major.
+  SmallVector<int64_t> reversedShape(seqType.getShape().rbegin(),
+                                     seqType.getShape().rend());
+
+  auto newType = mlir::MemRefType::get(reversedShape, seqType.getEleTy());
+  auto arrayConvert =
+      fir::ConvertOp::create(rewriter, loc, newType, acoOp.getMemref());
+
+  SmallVector<mlir::Value> adjustedIndices;
+  auto indices = acoOp.getIndices();
+
+  if (auto shapeOp = acoOp.getShape().getDefiningOp<ShapeOp>()) {
+    for (auto idx : indices) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(idx);
+      assert(expr && "analysis guaranteed index is affine");
+      auto adjustedExpr = *expr - 1;
+      auto map = mlir::AffineMap::get(builder.dims.size(), 0, adjustedExpr);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, builder.dims);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
+  } else if (auto shapeShiftOp =
+                 acoOp.getShape().getDefiningOp<ShapeShiftOp>()) {
+    auto pairs = shapeShiftOp.getPairs();
+    for (unsigned i = 0; i < indices.size(); ++i) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(indices[i]);
+      assert(expr && "analysis guaranteed index is affine");
+      auto adjustedExpr = *expr - mlir::getAffineSymbolExpr(0, context);
+      auto map = mlir::AffineMap::get(builder.dims.size(), 1, adjustedExpr);
+      SmallVector<mlir::Value> operands;
+      operands.append(builder.dims.begin(), builder.dims.end());
+      operands.push_back(pairs[i * 2]);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, operands);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
+  } else if (auto sliceOp = acoOp.getShape().getDefiningOp<SliceOp>()) {
+    auto triples = sliceOp.getTriples();
+    for (unsigned i = 0; i < indices.size(); ++i) {
+      AffineIndexBuilder builder(context);
+      auto expr = builder.build(indices[i]);
+      assert(expr && "analysis guaranteed index is affine");
+      auto lbSym = mlir::getAffineSymbolExpr(0, context);
+      auto strideSym = mlir::getAffineSymbolExpr(1, context);
+      auto adjustedExpr = (*expr - lbSym).floorDiv(strideSym);
+      auto map = mlir::AffineMap::get(builder.dims.size(), 2, adjustedExpr);
+      SmallVector<mlir::Value> operands;
+      operands.append(builder.dims.begin(), builder.dims.end());
+      operands.push_back(triples[i * 3]);
+      operands.push_back(triples[i * 3 + 2]);
+      auto adjusted =
+          affine::AffineApplyOp::create(rewriter, loc, map, operands);
+      adjustedIndices.push_back(adjusted.getResult());
+    }
   }
-}
 
-static void populateIndexArgs(fir::ArrayCoorOp acoOp,
-                              SmallVectorImpl<mlir::Value> &indexArgs,
-                              mlir::PatternRewriter &rewriter) {
-  if (auto shape = acoOp.getShape().getDefiningOp<ShapeOp>())
-    return populateIndexArgs(acoOp, shape, indexArgs, rewriter);
-  if (auto shapeShift = acoOp.getShape().getDefiningOp<ShapeShiftOp>())
-    return populateIndexArgs(acoOp, shapeShift, indexArgs, rewriter);
-  if (auto slice = acoOp.getShape().getDefiningOp<SliceOp>())
-    return populateIndexArgs(acoOp, slice, indexArgs, rewriter);
-}
+  // need reverse because memref is row major order but fir.array is column
+  // major order
+  std::reverse(adjustedIndices.begin(), adjustedIndices.end());
 
-/// Returns affine.apply and fir.convert from array_coor and gendims
-static std::pair<affine::AffineApplyOp, fir::ConvertOp>
-createAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
-  auto acoOp = arrayRef.getDefiningOp<ArrayCoorOp>();
-  auto affineMap =
-      createArrayIndexAffineMap(acoOp.getIndices().size(), acoOp.getContext());
-  SmallVector<mlir::Value> indexArgs;
-  indexArgs.append(acoOp.getIndices().begin(), acoOp.getIndices().end());
-
-  populateIndexArgs(acoOp, indexArgs, rewriter);
-
-  auto affineApply = affine::AffineApplyOp::create(rewriter, acoOp.getLoc(),
-                                                   affineMap, indexArgs);
-  auto arrayElementType = coordinateArrayElement(acoOp);
-  auto newType =
-      mlir::MemRefType::get({mlir::ShapedType::kDynamic}, arrayElementType);
-  auto arrayConvert = fir::ConvertOp::create(rewriter, acoOp.getLoc(), newType,
-                                             acoOp.getMemref());
-  return std::make_pair(affineApply, arrayConvert);
+  return {std::move(adjustedIndices), arrayConvert};
 }
 
 static void rewriteLoad(fir::LoadOp loadOp, mlir::PatternRewriter &rewriter) {
   rewriter.setInsertionPoint(loadOp);
-  auto affineOps = createAffineOps(loadOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(loadOp.getMemref(), rewriter);
   rewriter.replaceOpWithNewOp<affine::AffineLoadOp>(
-      loadOp, affineOps.second.getResult(), affineOps.first.getResult());
+      loadOp, result.arrayConvert.getResult(), result.indices);
 }
 
 static void rewriteStore(fir::StoreOp storeOp,
                          mlir::PatternRewriter &rewriter) {
   rewriter.setInsertionPoint(storeOp);
-  auto affineOps = createAffineOps(storeOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(storeOp.getMemref(), rewriter);
   rewriter.replaceOpWithNewOp<affine::AffineStoreOp>(
-      storeOp, storeOp.getValue(), affineOps.second.getResult(),
-      affineOps.first.getResult());
+      storeOp, storeOp.getValue(), result.arrayConvert.getResult(),
+      result.indices);
 }
 
 static void rewriteMemoryOps(Block *block, mlir::PatternRewriter &rewriter) {
@@ -478,6 +559,19 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
         functionAnalysis.getChildLoopAnalysis(loop);
     if (!loopAnalysis.canPromoteToAffine())
       return rewriter.notifyMatchFailure(loop, "cannot promote to affine");
+
+    // All enclosing fir.do_loop ops must also be promotable.  Otherwise
+    // this loop's affine operations would reference fir.do_loop block args
+    // (not affine.for IVs) as dimension ids, which is invalid.
+    for (auto *parent = loop->getParentOp(); parent;
+         parent = parent->getParentOp()) {
+      if (auto parentLoop = dyn_cast<fir::DoLoopOp>(parent)) {
+        auto parentAnalysis = functionAnalysis.getChildLoopAnalysis(parentLoop);
+        if (!parentAnalysis.canPromoteToAffine())
+          return rewriter.notifyMatchFailure(
+              loop, "enclosing fir.do_loop is not promotable");
+      }
+    }
     auto &loopOps = loop.getBody()->getOperations();
     auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
     auto results = resultOp.getOperands();
@@ -525,18 +619,39 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
     return genericBounds(op, rewriter);
   }
 
+  /// Produce the affine map and the matching operand list for one loop bound.
+  ///
+  /// The bound is first handed to AffineIndexBuilder.  When that yields an
+  /// expression, the builder's collected dims become the map operands and the
+  /// result is `(dims) -> (expr + offset)` with no symbols.  When it does not,
+  /// `operand` itself is bound as the single symbol of
+  /// `()[s0] -> (s0 + offset)`.
+  ///
+  /// \param operand     loop bound value to describe.
+  /// \param offset      constant added to the bound expression.
+  /// \param ctx         context used to create the affine expressions.
+  /// \param mapOperands receives the operands that go with the returned map.
+  static mlir::AffineMap boundMap(mlir::Value operand, int64_t offset,
+                                  mlir::MLIRContext *ctx,
+                                  SmallVectorImpl<mlir::Value> &mapOperands) {
+    AffineIndexBuilder indexBuilder(ctx);
+    auto boundExpr = indexBuilder.build(operand);
+    if (!boundExpr) {
+      // Not decomposable by the builder: pass the raw value as symbol s0.
+      mapOperands.push_back(operand);
+      auto symbol = mlir::getAffineSymbolExpr(0, ctx);
+      return mlir::AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1,
+                                  symbol + offset);
+    }
+
+    // Every value the builder collected becomes a dimension of the map.
+    mapOperands.append(indexBuilder.dims.begin(), indexBuilder.dims.end());
+    return mlir::AffineMap::get(indexBuilder.dims.size(), /*symbolCount=*/0,
+                                *boundExpr + offset);
+  }
+
   // when step for the loop is positive compile time constant
   std::pair<affine::AffineForOp, mlir::Value>
   positiveConstantStep(fir::DoLoopOp op, int64_t step,
                        mlir::PatternRewriter &rewriter) const {
+    auto *ctx = op.getContext();
+    SmallVector<mlir::Value> lbOperands, ubOperands;
+    auto lbMap = boundMap(op.getLowerBound(), 0, ctx, lbOperands);
+    auto ubMap = boundMap(op.getUpperBound(), 1, ctx, ubOperands);
     auto affineFor = affine::AffineForOp::create(
-        rewriter, op.getLoc(), ValueRange(op.getLowerBound()),
-        mlir::AffineMap::get(0, 1,
-                             mlir::getAffineSymbolExpr(0, op.getContext())),
-        ValueRange(op.getUpperBound()),
-        mlir::AffineMap::get(0, 1,
-                             1 + mlir::getAffineSymbolExpr(0, op.getContext())),
-        step, op.getIterOperands());
+        rewriter, op.getLoc(), lbOperands, lbMap, ubOperands, ubMap, step,
+        op.getIterOperands());
     return std::make_pair(affineFor, affineFor.getInductionVar());
   }
 
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 5a3059ebbd97f..ab64371e756f0 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_flang_library(FIRTransforms
   AddDebugInfo.cpp
   AffineDemotion.cpp
   AffinePromotion.cpp
+  SimplifyDoLoop.cpp
   AlgebraicSimplification.cpp
   AnnotateConstant.cpp
   ArrayValueCopy.cpp
diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
new file mode 100644
index 0000000000000..12f3c5a9a1f91
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -0,0 +1,639 @@
+//===-- SimplifyDoLoop.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// General-purpose FIR loop canonicalization pass.
+//
+// Transforms fir.do_loop nests into a canonical form suitable for affine
+// promotion and loop optimizations (tiling, fusion, interchange, etc.).
+//
+// The canonical form has:
+//   - No iter_args (shadow induction variable copies removed)
+//   - No memory-based IV tracking inside the loop body
+//   - Final IV values computed and stored after the outermost loop
+//
+// === Design Overview ===
+//
+// Analysis phase (per loop nest):
+//   1. Collect perfectly nested fir.do_loop chain.
+//   2. For each loop, verify iter_arg is a shadow of the induction variable:
+//      - init = fir.convert(lower_bound)
+//      - yield = arith.addi(iter_arg_or_load_of_iv, fir.convert(step))
+//   3. Verify safety conditions:
+//      a. Only one store to IV alloca inside loop (the init store of iter_arg)
+//      b. No function/subroutine calls in the nest
+//      c. IV alloca does not escape (only load/store/declare users)
+//      d. Loop results are only used for final IV stores
+//
+// Transformation phase:
+//   1. For each loop (innermost first):
+//      a. Remove the initial store (fir.store %iter_arg to %iv_alloca)
+//      b. Forward all loads of IV alloca inside loop body to fir.convert(IV)
+//      TODO: forwarding loads of the IV alloca could instead be done by another
+//      pass such as fir-memref-dataflow-opt (if it is available).
+//      c. Strip iter_args and fir.result, rebuild as simple fir.do_loop
+//   2. After the outermost loop, compute and store final IV values
+//      for all loops whose IV is live after the loop (outer to inner order).
+//      Fortran final value: final_iv = lb + ((ub - lb + step) / step) * step
+//      which equals the value of the iter_arg after the last increment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+namespace fir {
+#define GEN_PASS_DEF_SIMPLIFYDOLOOP
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "simplify-do-loop"
+
+using namespace fir;
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Per-loop bookkeeping built during analysis
+//===----------------------------------------------------------------------===//
+
+struct LoopIVInfo {
+  fir::DoLoopOp loop;              // the loop this entry describes
+  Value ivAlloca;                  // memref of the IV store found in the loop
+                                   // body (see findIVAlloca)
+  SmallVector<Value, 2> ivAliases; // result of collectAliases(ivAlloca)
+  Value lowerBound;                // loop.getLowerBound() at analysis time
+  Value upperBound;                // loop.getUpperBound() at analysis time
+  Value step;                      // loop.getStep() at analysis time
+  Type ivType;                     // type of the loop's first iter operand
+                                   // (e.g. i32)
+};
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+/// Gather every value known to name the IV storage behind `ivRef`.
+///
+/// When `ivRef` is produced by a fir.declare, that declare's memref operand is
+/// taken as the underlying storage; otherwise `ivRef` itself is.  The returned
+/// list starts with the underlying value and is followed by the result of
+/// each fir.declare that uses it.
+static SmallVector<Value, 2> collectAliases(Value ivRef) {
+  // Normalise to the storage the declare (if any) was built on.
+  Value storage = ivRef;
+  if (auto declOfRef = ivRef.getDefiningOp<fir::DeclareOp>())
+    storage = declOfRef.getMemref();
+
+  SmallVector<Value, 2> result;
+  result.push_back(storage);
+  for (Operation *userOp : storage.getUsers()) {
+    auto aliasDecl = dyn_cast<fir::DeclareOp>(userOp);
+    if (!aliasDecl)
+      continue;
+    result.push_back(aliasDecl.getResult());
+  }
+  return result;
+}
+
+/// Collect the chain of fir.do_loop ops reachable from `outer` by repeatedly
+/// descending into a loop's body.
+///
+/// At each level only the operations directly in the loop body are examined.
+/// The walk descends when exactly one of them is a fir.do_loop and stops
+/// otherwise.  Other operations in the body are not inspected here, so this
+/// function alone does not establish that the nest is perfectly nested.
+///
+/// The returned vector is ordered outermost first and always contains `outer`
+/// when `outer` is non-null.
+static SmallVector<fir::DoLoopOp> collectNest(fir::DoLoopOp outer) {
+  SmallVector<fir::DoLoopOp> chain;
+  for (fir::DoLoopOp level = outer; level;) {
+    chain.push_back(level);
+
+    // Count the loops that sit directly in this level's body.
+    fir::DoLoopOp lastChild;
+    unsigned numChildLoops = 0;
+    for (Operation &bodyOp : *level.getBody()) {
+      auto childLoop = dyn_cast<fir::DoLoopOp>(bodyOp);
+      if (!childLoop)
+        continue;
+      lastChild = childLoop;
+      ++numChildLoops;
+    }
+
+    // Zero or several child loops end the chain.
+    level = numChildLoops == 1 ? lastChild : fir::DoLoopOp();
+  }
+  return chain;
+}
+
+/// Look through any number of stacked fir.convert ops and return the first
+/// value that is not itself the result of a fir.convert.
+static Value stripConverts(Value val) {
+  for (;;) {
+    auto convertOp = val.getDefiningOp<fir::ConvertOp>();
+    if (!convertOp)
+      return val;
+    val = convertOp.getValue();
+  }
+}
+
+/// Return true when `val`, after looking through fir.convert ops, is exactly
+/// `target`.
+static bool originatesFrom(Value val, Value target) {
+  const Value root = stripConverts(val);
+  return root == target;
+}
+
+/// Locate the memory the loop uses to track its induction variable.
+///
+/// Returns the memref of the first fir.store directly in the loop body whose
+/// stored value originates (possibly through fir.convert chains) from the
+/// first iter_arg or from the induction variable.  Returns a null Value when
+/// the loop has no iter operands or no such store exists.
+///
+/// The whole top-level body is scanned rather than stopping at an inner
+/// fir.do_loop, so that the pass remains robust if upstream passes reorder
+/// operations.
+static Value findIVAlloca(fir::DoLoopOp loop) {
+  if (!loop.hasIterOperands() || loop.getNumIterOperands() < 1)
+    return {};
+
+  const Value shadowArg = loop.getRegionIterArgs()[0];
+  const Value inductionVar = loop.getInductionVar();
+  for (Operation &bodyOp : *loop.getBody()) {
+    auto storeOp = dyn_cast<fir::StoreOp>(bodyOp);
+    if (!storeOp)
+      continue;
+    const Value payload = storeOp.getValue();
+    const bool fromShadow = originatesFrom(payload, shadowArg);
+    if (fromShadow || originatesFrom(payload, inductionVar))
+      return storeOp.getMemref();
+  }
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
+//                          ANALYSIS PHASE
+//===----------------------------------------------------------------------===//
+
+// ---- Analysis 1: Confirm iter_arg is a shadow of the induction variable ----
+//
+// The iter_arg must mirror the index-typed induction variable:
+//   init  = fir.convert(lower_bound) : (index) -> i32
+//   yield = arith.addi(iter_arg_or_load_of_iv, fir.convert(step))
+//
+// `ivAlloca` is the IV memory reference returned by findIVAlloca(); it may be
+// the raw storage or a fir.declare result.  A load counts as "load of the IV"
+// when its address is any alias reported by collectAliases(ivAlloca), so the
+// check agrees with the alias set used by the store and escape analyses.
+//
+// Returns true when both the init value and the yielded value match the
+// pattern above; otherwise emits a debug message and returns false.
+
+static bool isShadowIV(fir::DoLoopOp loop, Value ivAlloca) {
+  auto iterOperands = loop.getIterOperands();
+  auto iterArg = loop.getRegionIterArgs()[0];
+
+  // init must be fir.convert applied directly to the loop's lower bound.
+  auto initConvert = iterOperands[0].getDefiningOp<fir::ConvertOp>();
+  if (!initConvert || initConvert.getValue() != loop.getLowerBound()) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] init is not fir.convert(lb)\n");
+    return false;
+  }
+
+  // The first yielded value must be produced by an arith.addi.
+  auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
+  auto addOp = resultOp.getOperand(0).getDefiningOp<arith::AddIOp>();
+  if (!addOp) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] yield is not arith.addi\n");
+    return false;
+  }
+
+  // All names under which the IV storage may be addressed.
+  const SmallVector<Value, 2> ivAliases = collectAliases(ivAlloca);
+
+  // True when `v` is the iter_arg itself or a load of the IV storage.
+  auto isIVValue = [&](Value v) -> bool {
+    if (v == iterArg)
+      return true;
+    auto load = v.getDefiningOp<fir::LoadOp>();
+    if (!load)
+      return false;
+    Value addr = load.getMemref();
+    if (addr == ivAlloca || llvm::is_contained(ivAliases, addr))
+      return true;
+    // Also accept a fir.declare layered directly on top of `ivAlloca`.
+    if (auto decl = addr.getDefiningOp<fir::DeclareOp>())
+      return decl.getMemref() == ivAlloca;
+    return false;
+  };
+
+  // addi is commutative: the IV may be on either side, the step on the other.
+  Value stepSide;
+  if (isIVValue(addOp.getLhs()))
+    stepSide = addOp.getRhs();
+  else if (isIVValue(addOp.getRhs()))
+    stepSide = addOp.getLhs();
+  else {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] addi doesn't use iter_arg/IV\n");
+    return false;
+  }
+
+  // The other addend must be fir.convert applied directly to the loop step.
+  auto stepConvert = stepSide.getDefiningOp<fir::ConvertOp>();
+  if (!stepConvert || stepConvert.getValue() != loop.getStep()) {
+    LLVM_DEBUG(llvm::dbgs() << "  [shadow] step operand mismatch\n");
+    return false;
+  }
+  return true;
+}
+
+// ---- Analysis 2: Only one store to IV alloca inside loop (the init store) --
+//
+// Visits every fir.store nested anywhere in `loop` whose address is one of
+// `ivAliases`.  The first such store whose value originates from the iter_arg
+// or the induction variable is accepted as the init store; any other store to
+// an alias makes the check fail.  A loop with no store to an alias at all
+// also passes.
+
+static bool singleStoreToIVAlloca(fir::DoLoopOp loop,
+                                  ArrayRef<Value> ivAliases) {
+  const Value shadowArg = loop.getRegionIterArgs()[0];
+  const Value inductionVar = loop.getInductionVar();
+  bool initSeen = false;
+  bool sawExtraStore = false;
+
+  loop.walk([&](fir::StoreOp storeOp) {
+    // Stores to unrelated memory are irrelevant.
+    const bool hitsIV = llvm::is_contained(ivAliases, storeOp.getMemref());
+    if (!hitsIV)
+      return;
+
+    // Only the first qualifying store is tolerated.
+    if (!initSeen) {
+      const Value payload = storeOp.getValue();
+      if (originatesFrom(payload, shadowArg) ||
+          originatesFrom(payload, inductionVar)) {
+        initSeen = true;
+        return;
+      }
+    }
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "  [store] extra store to IV: " << storeOp << "\n");
+    sawExtraStore = true;
+  });
+  return !sawExtraStore;
+}
+
+// ---- Analysis 3: No function/subroutine calls in the nest -----------------
+//
+// Walks every operation nested under `outermost` and returns false when any
+// of them is a fir.call, func.call or fir.dispatch.  Each offending op is
+// reported in the debug stream.
+
+static bool noCallsInNest(fir::DoLoopOp outermost) {
+  unsigned numCalls = 0;
+  outermost.walk([&numCalls](Operation *nestedOp) {
+    if (!isa<fir::CallOp, func::CallOp, fir::DispatchOp>(nestedOp))
+      return;
+    LLVM_DEBUG(llvm::dbgs() << "  [call] found: " << *nestedOp << "\n");
+    ++numCalls;
+  });
+  return numCalls == 0;
+}
+
+// ---- Analysis 4: IV alloca must not escape --------------------------------
+//
+// Every user of every alias must be one of:
+//   - fir.load    (reads the IV),
+//   - fir.declare (creates another name for it),
+//   - fir.store   that uses the alias as its *address*.
+// A fir.store that uses the alias as the *stored value* writes the IV's
+// address into memory, which lets it be reached through another pointer, so
+// it is treated as an escape like any other unknown user.
+//
+// Returns true when no alias escapes; otherwise reports the first offending
+// user in the debug stream and returns false.
+
+static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
+  for (Value alias : ivAliases) {
+    for (Operation *user : alias.getUsers()) {
+      bool benign = isa<fir::LoadOp, fir::DeclareOp>(user);
+      if (auto store = dyn_cast<fir::StoreOp>(user))
+        benign = store.getValue() != alias; // storing the address escapes it
+      if (benign)
+        continue;
+      LLVM_DEBUG(llvm::dbgs() << "  [escape] IV escapes: " << *user << "\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+// ---- Full nest analysis ---------------------------------------------------
+//
+// Runs every legality check on the nest described by `infos` (outermost loop
+// first, each entry's `loop` already set) and fills in the remaining fields
+// of each entry.  Returns true only when all checks pass; on the first
+// failure a debug message is emitted and false is returned, in which case
+// some entries may be only partially populated.
+//
+// Checks, in order:
+//   1. per loop: exactly one iter_arg, an IV store can be found, the iter_arg
+//      is a shadow of the IV, and there is no extra store to the IV;
+//   2. no call operations anywhere under the outermost loop;
+//   3. no IV alias escapes;
+//   4. each loop result is only consumed by stores to that loop's IV aliases.
+
+static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
+  // --- Per-loop: shadow-IV check, IV alloca discovery, single-store check ---
+  for (auto &info : infos) {
+    auto loop = info.loop;
+    if (!loop.hasIterOperands() || loop.getNumIterOperands() != 1) {
+      LLVM_DEBUG(llvm::dbgs() << "  skip: loop has != 1 iter_args at "
+                              << loop.getLoc() << "\n");
+      return false;
+    }
+
+    // A null result means no store of the iter_arg/IV was found in the body.
+    info.ivAlloca = findIVAlloca(loop);
+    if (!info.ivAlloca) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  cannot find IV alloca at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    // The alias list is used by the store, escape and result-use checks below.
+    info.ivAliases = collectAliases(info.ivAlloca);
+
+    if (!isShadowIV(loop, info.ivAlloca)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  not shadow IV at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    if (!singleStoreToIVAlloca(loop, info.ivAliases)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  multiple stores at " << loop.getLoc() << "\n");
+      return false;
+    }
+
+    // Record loop bounds and IV type from the iter_arg init value.
+    info.lowerBound = loop.getLowerBound();
+    info.upperBound = loop.getUpperBound();
+    info.step = loop.getStep();
+    info.ivType = loop.getIterOperands()[0].getType();
+  }
+
+  // --- No function calls in the nest ---
+  // infos.front() is the outermost loop, so this covers the whole nest.
+  if (!noCallsInNest(infos.front().loop))
+    return false;
+
+  // --- IV alloca must not escape ---
+  for (auto &info : infos) {
+    if (!ivDoesNotEscape(info.ivAliases))
+      return false;
+  }
+
+  // --- Loop results must only be used for final IV stores ---
+  // Any other consumer would still need the iter_arg result after it has been
+  // stripped from the loop.
+  for (auto &info : infos) {
+    for (auto result : info.loop.getResults()) {
+      for (auto *user : result.getUsers()) {
+        auto store = dyn_cast<fir::StoreOp>(user);
+        if (!store || !llvm::is_contained(info.ivAliases, store.getMemref())) {
+          LLVM_DEBUG(llvm::dbgs()
+                     << "  [result] loop result used outside IV store at "
+                     << info.loop.getLoc() << ": " << *user << "\n");
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                       TRANSFORMATION PHASE
+//===----------------------------------------------------------------------===//
+
+/// Return a value equivalent to `val` that can be used at the builder's
+/// current insertion point, which the caller has placed outside `outermost`.
+///
+/// Values defined outside `outermost` are returned unchanged.  Values defined
+/// inside are rebuilt at the insertion point:
+///   - a block argument found in `ivFinalMap` is replaced by its mapped value;
+///   - fir.convert and fir.load are re-created on a rematerialized input;
+///   - arith.constant is cloned;
+///   - arith addi/subi/muli/divsi/cmpi/select are cloned with all operands
+///     rematerialized recursively.
+///
+/// `ivFinalMap` maps loop induction variables (block arguments) to values
+/// already computed outside the nest, which lets inner-loop bounds that
+/// depend on outer IVs (e.g. triangular loops) be resolved.
+///
+/// NOTE(review): a block argument inside the nest that is missing from
+/// `ivFinalMap`, or an op of any other kind inside the nest, is returned
+/// as-is and would not dominate the insertion point — confirm the analysis
+/// rules these cases out.
+static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
+                                  OpBuilder &builder, Location loc,
+                                  const DenseMap<Value, Value> &ivFinalMap) {
+  // Shorthand for recursing with the same context.
+  auto recurse = [&](Value inner) {
+    return rematerializeOutside(inner, outermost, builder, loc, ivFinalMap);
+  };
+
+  // Block arguments: only those owned by the nest can be remapped.
+  if (auto blockArg = dyn_cast<BlockArgument>(val)) {
+    Operation *owner = blockArg.getOwner()->getParentOp();
+    if (outermost->isAncestor(owner)) {
+      auto found = ivFinalMap.find(val);
+      if (found != ivFinalMap.end())
+        return found->second;
+    }
+    return val;
+  }
+
+  // Op results defined outside the nest are already usable.
+  Operation *defOp = val.getDefiningOp();
+  if (!defOp || !outermost->isAncestor(defOp))
+    return val;
+
+  // fir.convert: rebuild on top of the rematerialized input.
+  if (auto convertOp = dyn_cast<fir::ConvertOp>(defOp)) {
+    Value input = recurse(convertOp.getValue());
+    return fir::ConvertOp::create(builder, loc, convertOp.getType(), input);
+  }
+
+  // fir.load: reload from the rematerialized address.
+  if (auto loadOp = dyn_cast<fir::LoadOp>(defOp)) {
+    Value address = recurse(loadOp.getMemref());
+    return fir::LoadOp::create(builder, loc, address);
+  }
+
+  // arith.constant: a plain copy is enough.
+  if (isa<arith::ConstantOp>(defOp))
+    return builder.clone(*defOp)->getResult(0);
+
+  // Integer arithmetic / compare / select: rematerialize the operands first,
+  // then clone the op and point the clone at the new operands.
+  if (isa<arith::AddIOp, arith::SubIOp, arith::MulIOp, arith::DivSIOp,
+          arith::CmpIOp, arith::SelectOp>(defOp)) {
+    SmallVector<Value> remapped;
+    remapped.reserve(defOp->getNumOperands());
+    for (Value operand : defOp->getOperands())
+      remapped.push_back(recurse(operand));
+    Operation *copy = builder.clone(*defOp);
+    copy->setOperands(remapped);
+    return copy->getResult(0);
+  }
+
+  // For anything else, assume it's already available.
+  return val;
+}
+
+/// Compute the Fortran final IV value and store it to the IV alloca.
+///
+/// Fortran DO loop semantics: after normal completion, the IV holds the
+/// value it would have received on the iteration that causes termination.
+/// For `DO I = lb, ub, step`:
+///   trip_count = MAX((ub - lb + step) / step, 0)
+///   final_iv   = lb + trip_count * step
+///
+/// The computation uses the loop's own index-typed bounds, then converts the
+/// result to the Fortran IV type (e.g. i32) and stores it to `info.ivAlloca`.
+///
+/// The bounds are read from `info.loop` (the rebuilt loop), not from the
+/// values cached in `info` during analysis: rebuilding an enclosing loop
+/// destroys its old induction-variable block argument, so a cached bound that
+/// was that block argument would dangle.  The rebuilt loop's operands have
+/// been kept up to date by replaceAllUsesWith.
+///
+/// `ivFinalMap` is populated with the mapping from this loop's IV (block arg)
+/// to its *last iteration value* (finalIndex - step).  Inner loops whose
+/// bounds depend on an outer IV need the value from the last iteration, not
+/// the Fortran final value (which is one step past the last iteration).
+/// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
+/// with i=100 (last iteration), not i=101 (Fortran final).
+///
+/// NOTE(review): the store is emitted unconditionally.  When an enclosing
+/// loop runs zero iterations, an inner loop is never entered, yet its IV is
+/// still overwritten here — confirm this matches the intended semantics.
+///
+/// \param builder    builder positioned after the outermost loop.
+/// \param loc        location given to every emitted op.
+/// \param info       loop record; `info.loop` must be the rebuilt loop.
+/// \param outermost  outermost loop of the nest (rematerialization boundary).
+/// \param ivFinalMap in/out map from induction variables to the index value
+///                   of their last iteration.
+static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
+                             fir::DoLoopOp outermost,
+                             DenseMap<Value, Value> &ivFinalMap) {
+  // Take the bounds from the live loop so they cannot refer to erased IR.
+  fir::DoLoopOp loop = info.loop;
+
+  // Rematerialize bounds outside the outermost loop if needed.
+  // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV
+  // block argument will be resolved via ivFinalMap.
+  Value lb = rematerializeOutside(loop.getLowerBound(), outermost, builder,
+                                  loc, ivFinalMap);
+  Value ub = rematerializeOutside(loop.getUpperBound(), outermost, builder,
+                                  loc, ivFinalMap);
+  Value step = rematerializeOutside(loop.getStep(), outermost, builder, loc,
+                                    ivFinalMap);
+
+  // trip_count = (ub - lb + step) / step
+  Value ubMinusLb = arith::SubIOp::create(builder, loc, ub, lb);
+  Value ubMinusLbPlusStep =
+      arith::AddIOp::create(builder, loc, ubMinusLb, step);
+  Value tripCount =
+      arith::DivSIOp::create(builder, loc, ubMinusLbPlusStep, step);
+
+  // Clamp trip count to >= 0.
+  Value zero = arith::ConstantIndexOp::create(builder, loc, 0);
+  Value isPositive = arith::CmpIOp::create(
+      builder, loc, arith::CmpIPredicate::sgt, tripCount, zero);
+  Value clampedTrip =
+      arith::SelectOp::create(builder, loc, isPositive, tripCount, zero);
+
+  // final_index = lb + trip_count * step
+  Value tripTimesStep = arith::MulIOp::create(builder, loc, clampedTrip, step);
+  Value finalIndex = arith::AddIOp::create(builder, loc, lb, tripTimesStep);
+
+  // Record the *last iteration* value (finalIndex - step) for this IV so
+  // that inner loops whose bounds depend on this IV use the correct value.
+  // Fortran final value = lb + trip_count * step (one step PAST the last
+  // iteration), but the inner loop's last execution sees the outer IV at
+  // lb + (trip_count - 1) * step.
+  Value lastIterValue = arith::SubIOp::create(builder, loc, finalIndex, step);
+  ivFinalMap[loop.getInductionVar()] = lastIterValue;
+
+  // Convert from index to the Fortran IV type (e.g. i32).
+  Value finalIV = fir::ConvertOp::create(builder, loc, info.ivType, finalIndex);
+
+  // Store to the IV alloca.
+  fir::StoreOp::create(builder, loc, finalIV, info.ivAlloca);
+
+  LLVM_DEBUG(llvm::dbgs() << "  emitted final IV store for " << info.ivAlloca
+                          << " at " << loc << "\n");
+}
+
+/// Transform one loop: remove init/final stores, forward IV loads, strip
+/// iter_args, and rebuild as a simple fir.do_loop.
+///
+/// \param loop      loop to rewrite; it is erased before returning.
+/// \param ivAliases every value that names this loop's IV storage.
+/// \param builder   builder used for new ops; its insertion point is changed.
+/// \return the replacement loop, created immediately before the original and
+///         holding the original body (minus its terminator).
+///
+/// NOTE(review): the replacement is created from the lower bound, upper bound
+/// and step only; whether any attributes carried by the original loop need to
+/// be forwarded should be confirmed.
+static fir::DoLoopOp transformOneLoop(fir::DoLoopOp loop,
+                                      ArrayRef<Value> ivAliases,
+                                      OpBuilder &builder) {
+  auto loc = loop.getLoc();
+  auto iv = loop.getInductionVar();
+  auto iterArg = loop.getRegionIterArgs()[0];
+
+  LLVM_DEBUG(llvm::dbgs() << "  transforming loop at " << loc << "\n");
+
+  // Identify the increment addi (yielded by fir.result).
+  auto resultOp = cast<fir::ResultOp>(loop.getBody()->getTerminator());
+  Operation *incrementOp = nullptr;
+  if (auto addOp = resultOp.getOperand(0).getDefiningOp<arith::AddIOp>())
+    incrementOp = addOp;
+
+  // --- Remove initial store to IV alloca ---
+  // The init store may be:  fir.store %iterArg to %alloca
+  //                    or:  fir.store (fir.convert %iterArg) to %alloca
+  //                    or:  fir.store (fir.convert %iv) to %alloca
+  // Scan the operations directly in the loop body and erase the first store
+  // to any IV alias whose value originates from iterArg or the IV.
+  for (auto &op : llvm::make_early_inc_range(*loop.getBody())) {
+    if (auto store = dyn_cast<fir::StoreOp>(op)) {
+      if (llvm::is_contained(ivAliases, store.getMemref()) &&
+          (originatesFrom(store.getValue(), iterArg) ||
+           originatesFrom(store.getValue(), iv))) {
+        // Any dead fir.convert chain feeding this store will be cleaned up
+        // by the subsequent canonicalize pass in the pipeline.
+        store.erase();
+        break; // only remove the first (init) store
+      }
+    }
+  }
+
+  // --- Remove final store: fir.store %loop_result to %iv_alloca ---
+  // Early-inc iteration keeps the user list walk valid while stores are
+  // erased.
+  for (auto result : loop.getResults()) {
+    for (auto *user : llvm::make_early_inc_range(result.getUsers()))
+      if (auto store = dyn_cast<fir::StoreOp>(user))
+        if (llvm::is_contained(ivAliases, store.getMemref()))
+          store.erase();
+  }
+
+  // --- Forward loads of IV alloca anywhere inside loop → fir.convert(IV) ---
+  // The initial store was removed, so loads of the IV alloca inside the
+  // loop (including nested loops) now need to read from the index-typed
+  // induction variable (converted to the IV's Fortran type).
+  loop.walk([&](fir::LoadOp load) {
+    if (llvm::is_contained(ivAliases, load.getMemref())) {
+      builder.setInsertionPoint(load);
+      auto ivCast = fir::ConvertOp::create(builder, loc, load.getType(), iv);
+      load.getResult().replaceAllUsesWith(ivCast);
+      load.erase();
+    }
+  });
+
+  // --- Replace remaining iter_arg uses with fir.convert(IV) ---
+  {
+    // Snapshot the uses first: use->set() below edits the use list.
+    SmallVector<OpOperand *> uses;
+    for (auto &use : iterArg.getUses())
+      uses.push_back(&use);
+
+    for (auto *use : uses) {
+      // The increment is erased below, so its use needs no replacement.
+      if (use->getOwner() == incrementOp)
+        continue;
+      builder.setInsertionPoint(use->getOwner());
+      auto ivCast = fir::ConvertOp::create(builder, loc, iterArg.getType(), iv);
+      use->set(ivCast);
+    }
+  }
+
+  // --- Clear fir.result operands ---
+  auto *terminator = loop.getBody()->getTerminator();
+  terminator->eraseOperands(0, terminator->getNumOperands());
+
+  // Erase the increment addi (its result was the fir.result operand).
+  // It is kept if something else still uses it.
+  if (incrementOp && incrementOp->use_empty())
+    incrementOp->erase();
+
+  // --- Rebuild loop without iter_args ---
+  builder.setInsertionPoint(loop);
+  auto newLoop = fir::DoLoopOp::create(builder, loc, loop.getLowerBound(),
+                                       loop.getUpperBound(), loop.getStep());
+  loop.getInductionVar().replaceAllUsesWith(newLoop.getInductionVar());
+
+  // Move everything except the old terminator to the front of the new body.
+  auto &oldOps = loop.getBody()->getOperations();
+  auto &newOps = newLoop.getBody()->getOperations();
+  newOps.splice(newOps.begin(), oldOps, oldOps.begin(),
+                std::prev(oldOps.end()));
+
+  loop.erase();
+  return newLoop;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass entry
+//===----------------------------------------------------------------------===//
+
+/// Pass driver: for every outermost fir.do_loop in the operation, collect its
+/// loop chain, analyse it, and — when the analysis accepts it — rewrite each
+/// loop of the chain and emit the final IV stores after the outermost loop.
+class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
+public:
+  void runOnOperation() override {
+    auto func = getOperation();
+
+    // Gather the roots up front; the rewrite below replaces loops.
+    SmallVector<fir::DoLoopOp> roots;
+    func.walk([&roots](fir::DoLoopOp candidate) {
+      if (candidate->getParentOfType<fir::DoLoopOp>())
+        return;
+      roots.push_back(candidate);
+    });
+
+    for (fir::DoLoopOp root : roots) {
+      SmallVector<fir::DoLoopOp> chain = collectNest(root);
+      LLVM_DEBUG(llvm::dbgs()
+                 << "SimplifyDoLoop: nest depth " << chain.size() << " at "
+                 << root.getLoc() << "\n");
+
+      if (chain.empty()) {
+        LLVM_DEBUG(llvm::dbgs() << "  skip (empty nest)\n");
+        continue;
+      }
+
+      // ======== Analysis Phase ========
+      // One record per loop, outermost first; only `loop` is set here.
+      SmallVector<LoopIVInfo> infos;
+      for (fir::DoLoopOp member : chain) {
+        LoopIVInfo entry;
+        entry.loop = member;
+        infos.push_back(std::move(entry));
+      }
+
+      const bool accepted = analyzeNest(infos);
+      if (!accepted) {
+        LLVM_DEBUG(llvm::dbgs() << "  nest rejected by analysis\n");
+        continue;
+      }
+
+      LLVM_DEBUG(llvm::dbgs() << "  analysis passed — transforming "
+                              << infos.size() << " loops\n");
+
+      // ======== Transformation Phase ========
+      OpBuilder builder(func.getContext());
+
+      // Innermost first: each record is updated to point at its new loop.
+      for (LoopIVInfo &info : llvm::reverse(infos))
+        info.loop = transformOneLoop(info.loop, info.ivAliases, builder);
+
+      // ---- After the outermost loop, emit final IV value stores. ----
+      //         Outer-to-inner order, so that values recorded for outer IVs
+      //         are available when computing inner IV finals (e.g.
+      //         triangular loops where inner bounds depend on outer IVs).
+      fir::DoLoopOp rebuiltRoot = infos.front().loop;
+      builder.setInsertionPointAfter(rebuiltRoot);
+
+      DenseMap<Value, Value> ivFinalMap;
+      const Location storeLoc = rebuiltRoot.getLoc();
+      for (LoopIVInfo &info : infos)
+        emitFinalIVStore(builder, storeLoc, info, rebuiltRoot, ivFinalMap);
+    }
+  }
+};
+
+} // namespace
+
+/// Factory for the SimplifyDoLoop pass: returns a newly allocated instance.
+std::unique_ptr<mlir::Pass> fir::createSimplifyDoLoopPass() {
+  return std::make_unique<SimplifyDoLoop>();
+}
diff --git a/flang/test/Fir/affine-demotion.fir b/flang/test/Fir/affine-demotion.fir
index bdb84be3624cb..635784c5e4bb9 100644
--- a/flang/test/Fir/affine-demotion.fir
+++ b/flang/test/Fir/affine-demotion.fir
@@ -4,27 +4,31 @@
 
 #map0 = affine_map<()[s0, s1] -> (s1 - s0 + 1)>
 #map1 = affine_map<()[s0] -> (s0 + 1)>
-#map2 = affine_map<(d0)[s0, s1, s2] -> (d0 * s2 - s0)>
+#map2 = affine_map<(d0) -> (d0 - 1)>
 module  {
   func.func @calc(%arg0: !fir.ref<!fir.array<?xf32>>, %arg1: !fir.ref<!fir.array<?xf32>>, %arg2: !fir.ref<!fir.array<?xf32>>) {
     %c1 = arith.constant 1 : index
     %c100 = arith.constant 100 : index
     %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+    %a = fir.declare %arg0(%0) {uniq_name = "a"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %b = fir.declare %arg1(%0) {uniq_name = "b"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
     %1 = affine.apply #map0()[%c1, %c100]
     %2 = fir.alloca !fir.array<?xf32>, %1
-    %3 = fir.convert %arg0 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-    %4 = fir.convert %arg1 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-    %5 = fir.convert %2 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %t = fir.declare %2(%0) {uniq_name = "t"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %3 = fir.convert %a : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %4 = fir.convert %b : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %5 = fir.convert %t : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
     affine.for %arg3 = %c1 to #map1()[%c100] {
-      %7 = affine.apply #map2(%arg3)[%c1, %c100, %c1]
+      %7 = affine.apply #map2(%arg3)
       %8 = affine.load %3[%7] : memref<?xf32>
       %9 = affine.load %4[%7] : memref<?xf32>
       %10 = arith.addf %8, %9 : f32
       affine.store %10, %5[%7] : memref<?xf32>
     }
-    %6 = fir.convert %arg2 : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
+    %c = fir.declare %arg2(%0) {uniq_name = "c"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
+    %6 = fir.convert %c : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
     affine.for %arg3 = %c1 to #map1()[%c100] {
-      %7 = affine.apply #map2(%arg3)[%c1, %c100, %c1]
+      %7 = affine.apply #map2(%arg3)
       %8 = affine.load %5[%7] : memref<?xf32>
       %9 = affine.load %4[%7] : memref<?xf32>
       %10 = arith.mulf %8, %9 : f32
@@ -34,35 +38,135 @@ module  {
   }
 }
 
-// CHECK:  func @calc(%[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_2:.*]]: !fir.ref<!fir.array<?xf32>>) {
-// CHECK:    %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK:    %[[VAL_4:.*]] = arith.constant 100 : index
-// CHECK:    %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-// CHECK:    %[[VAL_6:.*]] = arith.constant 100 : index
-// CHECK:    %[[VAL_7:.*]] = fir.alloca !fir.array<?xf32>, %[[VAL_6]]
-// CHECK:    %[[VAL_8:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    %[[VAL_10:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    affine.for %[[VAL_11:.*]] = 1 to 101 {
-// CHECK:      %[[VAL_12:.*]] = affine.apply #map(%[[VAL_11]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
-// CHECK:      %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_8]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-// CHECK:      %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_9]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-// CHECK:      %[[VAL_17:.*]] = arith.addf %[[VAL_14]], %[[VAL_16]] : f32
-// CHECK:      %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_10]], %[[VAL_12]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      fir.store %[[VAL_17]] to %[[VAL_18]] : !fir.ref<f32>
+// CHECK:  func @calc(%[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<?xf32>>, %[[ARG2:.*]]: !fir.ref<!fir.array<?xf32>>) {
+// CHECK:    %[[C1:.*]] = arith.constant 1 : index
+// CHECK:    %[[C100:.*]] = arith.constant 100 : index
+// CHECK:    %[[SHP:.*]] = fir.shape %[[C100]] : (index) -> !fir.shape<1>
+// CHECK:    %[[A:.*]] = fir.declare %[[ARG0]](%[[SHP]]) {uniq_name = "a"}
+// CHECK:    %[[B:.*]] = fir.declare %[[ARG1]](%[[SHP]]) {uniq_name = "b"}
+// CHECK:    %[[ALLOCSZ:.*]] = arith.constant 100 : index
+// CHECK:    %[[ALLOC:.*]] = fir.alloca !fir.array<?xf32>, %[[ALLOCSZ]]
+// CHECK:    %[[T:.*]] = fir.declare %[[ALLOC]](%[[SHP]]) {uniq_name = "t"}
+// fir.convert removed — affine.load/store demoted to fir.array_coor + fir.load/store:
+// CHECK:    affine.for %[[IV1:.*]] = 1 to 101 {
+// CHECK:      %[[IDX1:.*]] = affine.apply #{{.*}}(%[[IV1]])
+// 0-based → 1-based for a(i):
+// CHECK:      %[[C1_A:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_A:.*]] = arith.addi %[[IDX1]], %[[C1_A]] : index
+// CHECK:      %[[A_COOR:.*]] = fir.array_coor %[[A]](%[[SHP]]) %[[FI_A]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[A_VAL:.*]] = fir.load %[[A_COOR]] : !fir.ref<f32>
+// 0-based → 1-based for b(i):
+// CHECK:      %[[C1_B:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_B:.*]] = arith.addi %[[IDX1]], %[[C1_B]] : index
+// CHECK:      %[[B_COOR1:.*]] = fir.array_coor %[[B]](%[[SHP]]) %[[FI_B]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[B_VAL1:.*]] = fir.load %[[B_COOR1]] : !fir.ref<f32>
+// CHECK:      %[[ADD:.*]] = arith.addf %[[A_VAL]], %[[B_VAL1]] : f32
+// 0-based → 1-based for t(i):
+// CHECK:      %[[C1_T:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_T:.*]] = arith.addi %[[IDX1]], %[[C1_T]] : index
+// CHECK:      %[[T_COOR1:.*]] = fir.array_coor %[[T]](%[[SHP]]) %[[FI_T]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      fir.store %[[ADD]] to %[[T_COOR1]] : !fir.ref<f32>
 // CHECK:    }
-// CHECK:    %[[VAL_19:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
-// CHECK:    affine.for %[[VAL_20:.*]] = 1 to 101 {
-// CHECK:      %[[VAL_21:.*]] = affine.apply #map(%[[VAL_20]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
-// CHECK:      %[[VAL_22:.*]] = fir.coordinate_of %[[VAL_10]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<f32>
-// CHECK:      %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_9]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<f32>
-// CHECK:      %[[VAL_26:.*]] = arith.mulf %[[VAL_23]], %[[VAL_25]] : f32
-// CHECK:      %[[VAL_27:.*]] = fir.coordinate_of %[[VAL_19]], %[[VAL_21]] : (!fir.ref<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-// CHECK:      fir.store %[[VAL_26]] to %[[VAL_27]] : !fir.ref<f32>
+// CHECK:    %[[C:.*]] = fir.declare %[[ARG2]](%[[SHP]]) {uniq_name = "c"}
+// CHECK:    affine.for %[[IV2:.*]] = 1 to 101 {
+// CHECK:      %[[IDX2:.*]] = affine.apply #{{.*}}(%[[IV2]])
+// 0-based → 1-based for t(i):
+// CHECK:      %[[C1_T2:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_T2:.*]] = arith.addi %[[IDX2]], %[[C1_T2]] : index
+// CHECK:      %[[T_COOR2:.*]] = fir.array_coor %[[T]](%[[SHP]]) %[[FI_T2]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[T_VAL:.*]] = fir.load %[[T_COOR2]] : !fir.ref<f32>
+// 0-based → 1-based for b(i):
+// CHECK:      %[[C1_B2:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_B2:.*]] = arith.addi %[[IDX2]], %[[C1_B2]] : index
+// CHECK:      %[[B_COOR2:.*]] = fir.array_coor %[[B]](%[[SHP]]) %[[FI_B2]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      %[[B_VAL2:.*]] = fir.load %[[B_COOR2]] : !fir.ref<f32>
+// CHECK:      %[[MUL:.*]] = arith.mulf %[[T_VAL]], %[[B_VAL2]] : f32
+// 0-based → 1-based for c(i):
+// CHECK:      %[[C1_C:.*]] = arith.constant 1 : index
+// CHECK:      %[[FI_C:.*]] = arith.addi %[[IDX2]], %[[C1_C]] : index
+// CHECK:      %[[C_COOR:.*]] = fir.array_coor %[[C]](%[[SHP]]) %[[FI_C]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:      fir.store %[[MUL]] to %[[C_COOR]] : !fir.ref<f32>
 // CHECK:    }
 // CHECK:    return
 // CHECK:  }
+
+// -----
+
+// Test: 2D nested loop demotion with static-shape arrays.
+#map2 = affine_map<()[s0] -> (s0 + 1)>
+#map3 = affine_map<(d0) -> (d0 - 1)>
+module {  // demotion input: element-wise copy of %arg0 into %arg1 via memref views
+  func.func @calc_2d_static(%arg0: !fir.ref<!fir.array<100x100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>) {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %0 = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>  // not referenced by the loops below
+    %1 = fir.convert %arg0 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>  // source viewed as a memref
+    %2 = fir.convert %arg1 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>  // destination viewed as a memref
+    affine.for %arg2 = %c1 to #map2()[%c100] {  // #map2 is s0 + 1; the expected output prints the bounds as 1 to 101
+      %3 = affine.apply #map3(%arg2)  // #map3 is d0 - 1
+      affine.for %arg3 = %c1 to #map2()[%c100] {  // same bounds as the outer loop
+        %4 = affine.apply #map3(%arg3)  // #map3 is d0 - 1
+        %5 = affine.load %1[%3, %4] : memref<100x100xf32>  // subscripts are (outer, inner)
+        affine.store %5, %2[%3, %4] : memref<100x100xf32>  // stored at the same subscripts in the destination
+      }
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func @calc_2d_static(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK:    %[[C1:.*]] = arith.constant 1 : index
+// CHECK:    %[[C100:.*]] = arith.constant 100 : index
+// CHECK:    %[[SHP:.*]] = fir.shape %[[C100]], %[[C100]] : (index, index) -> !fir.shape<2>
+// fir.convert removed — static arrays use fir.coordinate_of directly:
+// CHECK:    affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:      %[[IDXI:.*]] = affine.apply #{{.*}}(%[[I]])
+// CHECK:      affine.for %[[J:.*]] = 1 to 101 {
+// CHECK:        %[[IDXJ:.*]] = affine.apply #{{.*}}(%[[J]])
+// Indices reversed (row-major memref → column-major Fortran):
+// CHECK:        %[[A_COOR:.*]] = fir.coordinate_of %[[A]], %[[IDXJ]], %[[IDXI]] : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:        %[[A_VAL:.*]] = fir.load %[[A_COOR]] : !fir.ref<f32>
+// CHECK:        %[[B_COOR:.*]] = fir.coordinate_of %[[B]], %[[IDXJ]], %[[IDXI]] : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:        fir.store %[[A_VAL]] to %[[B_COOR]] : !fir.ref<f32>
+// CHECK:      }
+// CHECK:    }
+// CHECK:    return
+
+// -----
+
+// Test: Triangular loop demotion — inner bound depends on outer IV.
+#map4 = affine_map<()[s0] -> (s0 + 1)>
+#map5 = affine_map<(d0) -> (d0 + 1)>
+#map6 = affine_map<(d0) -> (d0 - 1)>
+module {  // demotion input: copy loop whose inner upper bound depends on the outer IV
+  func.func @triangular_demotion(%arg0: !fir.ref<!fir.array<100x100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>) {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %0 = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>  // not referenced by the loops below
+    %1 = fir.convert %arg0 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>  // source viewed as a memref
+    %2 = fir.convert %arg1 : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>  // destination viewed as a memref
+    affine.for %arg2 = %c1 to #map4()[%c100] {  // #map4 is s0 + 1; the expected output prints the bounds as 1 to 101
+      %3 = affine.apply #map6(%arg2)  // #map6 is d0 - 1
+      affine.for %arg3 = %c1 to #map5(%arg2) {  // #map5 is d0 + 1, applied to the outer IV as a dimension operand
+        %4 = affine.apply #map6(%arg3)  // #map6 is d0 - 1
+        %5 = affine.load %1[%3, %4] : memref<100x100xf32>  // subscripts are (outer, inner)
+        affine.store %5, %2[%3, %4] : memref<100x100xf32>  // stored at the same subscripts in the destination
+      }
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func @triangular_demotion(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// Outer: constant bound; Inner: IV-dependent bound
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:         affine.for %[[J:.*]] = 1 to #{{.*}}(%[[I]]) {
+// CHECK:           fir.coordinate_of %[[A]], %{{.*}}, %{{.*}} : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:           fir.load %{{.*}} : !fir.ref<f32>
+// CHECK:           fir.coordinate_of %[[B]], %{{.*}}, %{{.*}} : (!fir.ref<!fir.array<100x100xf32>>, index, index) -> !fir.ref<f32>
+// CHECK:           fir.store %{{.*}} to %{{.*}} : !fir.ref<f32>
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
diff --git a/flang/test/Fir/affine-promotion.fir b/flang/test/Fir/affine-promotion.fir
index 46467ab4a292a..d48d66cbd8a9f 100644
--- a/flang/test/Fir/affine-promotion.fir
+++ b/flang/test/Fir/affine-promotion.fir
@@ -55,16 +55,16 @@ func.func @loop_with_load_and_store(%a1: !arr_d1, %a2: !arr_d1, %a3: !arr_d1) {
 // CHECK:    %[[VAL_8:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
 // CHECK:    %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
 // CHECK:    %[[VAL_10:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:    affine.for %[[VAL_11:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_4]]] {
-// CHECK:      %[[VAL_12:.*]] = affine.apply #{{.*}}(%[[VAL_11]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
+// CHECK:    affine.for %[[VAL_11:.*]] = 1 to 101 {
+// CHECK:      %[[VAL_12:.*]] = affine.apply #{{.*}}(%[[VAL_11]])
 // CHECK:      %[[VAL_13:.*]] = affine.load %[[VAL_8]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:      %[[VAL_14:.*]] = affine.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:      %[[VAL_15:.*]] = arith.addf %[[VAL_13]], %[[VAL_14]] : f32
 // CHECK:      affine.store %[[VAL_15]], %[[VAL_10]]{{\[}}%[[VAL_12]]] : memref<?xf32>
 // CHECK:    }
 // CHECK:    %[[VAL_16:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:    affine.for %[[VAL_17:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_4]]] {
-// CHECK:      %[[VAL_18:.*]] = affine.apply #{{.*}}(%[[VAL_17]]){{\[}}%[[VAL_3]], %[[VAL_4]], %[[VAL_3]]]
+// CHECK:    affine.for %[[VAL_17:.*]] = 1 to 101 {
+// CHECK:      %[[VAL_18:.*]] = affine.apply #{{.*}}(%[[VAL_17]])
 // CHECK:      %[[VAL_19:.*]] = affine.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref<?xf32>
 // CHECK:      %[[VAL_20:.*]] = affine.load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xf32>
 // CHECK:      %[[VAL_21:.*]] = arith.mulf %[[VAL_19]], %[[VAL_20]] : f32
@@ -106,32 +106,30 @@ func.func @loop_with_if(%a: !arr_d1, %v: f32) {
   }
   return
 }
-
 // CHECK: func @loop_with_if(%[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_1:.*]]: f32) {
-// CHECK:   %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK:   %[[VAL_4:.*]] = arith.constant 2 : index
 // CHECK:   %[[VAL_5:.*]] = arith.constant 100 : index
 // CHECK:   %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
 // CHECK:   %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:   affine.for %[[VAL_8:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
-// CHECK:     %[[VAL_9:.*]] = affine.apply #{{.*}}(%[[VAL_8]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:   affine.for %[[VAL_8:.*]] = 1 to 101 {
+// CHECK:     %[[VAL_9:.*]] = affine.apply #{{.*}}(%[[VAL_8]])
 // CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_9]]] : memref<?xf32>
 // CHECK:   }
-// CHECK:   affine.for %[[VAL_10:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
-// CHECK:     %[[VAL_11:.*]] = affine.apply #{{.*}}(%[[VAL_10]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:   affine.for %[[VAL_10:.*]] = 1 to 101 {
+// CHECK:     %[[VAL_11:.*]] = affine.apply #{{.*}}(%[[VAL_10]])
 // CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref<?xf32>
 // CHECK:   }
-// CHECK:   affine.for %[[VAL_12:.*]] = %[[VAL_3]] to #{{.*}}(){{\[}}%[[VAL_5]]] {
+// CHECK:   affine.for %[[VAL_12:.*]] = 1 to 101 {
 // CHECK:     %[[VAL_13:.*]] = arith.subi %[[VAL_12]], %[[VAL_4]] : index
 // CHECK:     affine.if #set(%[[VAL_12]]) {
-// CHECK:       %[[VAL_14:.*]] = affine.apply #{{.*}}(%[[VAL_12]]){{\[}}%[[VAL_3]], %[[VAL_5]], %[[VAL_3]]]
+// CHECK:       %[[VAL_14:.*]] = affine.apply #{{.*}}(%[[VAL_12]])
 // CHECK:       affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_14]]] : memref<?xf32>
 // CHECK:     }
 // CHECK:   }
 // CHECK:   return
 // CHECK: }
 
+
 func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100x100xf32>>, %arg2: !fir.ref<!fir.array<100xf32>>) -> f32 {
   %c1 = arith.constant 1 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -183,32 +181,35 @@ func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.re
 // CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 // CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
 // CHECK:           %[[VAL_5:.*]] = fir.alloca i32
-// CHECK:           %[[VAL_6:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:           %[[VAL_7:.*]] = affine.for %[[VAL_8:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_9:.*]] = %[[VAL_1]]) -> (f32) {
-// CHECK:             %[[VAL_10:.*]] = affine.apply #{{.*}}(%[[VAL_8]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:             %[[VAL_11:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<?xf32>
+// First loop promoted — memref<100xf32> (static shape):
+// CHECK:           %[[VAL_6:.*]] = fir.convert %[[ARG0]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:           %[[VAL_7:.*]] = affine.for %[[VAL_8:.*]] = 1 to 101 iter_args(%[[VAL_9:.*]] = %[[VAL_1]]) -> (f32) {
+// CHECK:             %[[VAL_10:.*]] = affine.apply #{{.*}}(%[[VAL_8]])
+// CHECK:             %[[VAL_11:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref<100xf32>
 // CHECK:             %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
 // CHECK:             affine.yield %[[VAL_12]] : f32
 // CHECK:           }
+// Middle loop stays as fir.do_loop (non-promotable: fir.convert reinterprets pointer):
 // CHECK:           %[[VAL_13:.*]]:2 = fir.do_loop %[[VAL_14:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_0]] iter_args(%[[VAL_15:.*]] = %[[VAL_7]]) -> (index, f32) {
 // CHECK:             %[[VAL_16:.*]] = fir.array_coor %[[ARG1]](%[[VAL_4]]) %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, index, index) -> !fir.ref<f32>
 // CHECK:             %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<f32>) -> !fir.ref<!fir.array<100xf32>>
-// CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:             %[[VAL_19:.*]] = affine.for %[[VAL_20:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_21:.*]] = %[[VAL_15]]) -> (f32) {
-// CHECK:               %[[VAL_22:.*]] = affine.apply #{{.*}}(%[[VAL_20]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:               %[[VAL_23:.*]] = affine.load %[[VAL_18]]{{\[}}%[[VAL_22]]] : memref<?xf32>
+// Inner loop also stays as fir.do_loop:
+// CHECK:             %[[VAL_19:.*]] = fir.do_loop %[[VAL_20:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_0]] iter_args(%[[VAL_21:.*]] = %[[VAL_15]]) -> (f32) {
+// CHECK:               %[[VAL_22:.*]] = fir.array_coor %[[VAL_17]](%[[VAL_3]]) %[[VAL_20]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+// CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<f32>
 // CHECK:               %[[VAL_24:.*]] = arith.addf %[[VAL_21]], %[[VAL_23]] fastmath<contract> : f32
-// CHECK:               affine.yield %[[VAL_24]] : f32
+// CHECK:               fir.result %[[VAL_24]] : f32
 // CHECK:             }
 // CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_14]], %[[VAL_0]] overflow<nsw> : index
 // CHECK:             fir.result %[[VAL_25]], %[[VAL_19]] : index, f32
 // CHECK:           }
-// CHECK:           %[[VAL_26:.*]] = fir.convert %[[ARG2]] : (!fir.ref<!fir.array<100xf32>>) -> memref<?xf32>
-// CHECK:           %[[VAL_27:.*]]:2 = affine.for %[[VAL_28:.*]] = %[[VAL_0]] to #{{.*}}(){{\[}}%[[VAL_2]]] iter_args(%[[VAL_29:.*]] = %[[VAL_30:.*]]#1, %[[VAL_31:.*]] = %[[VAL_1]]) -> (f32, f32) {
-// CHECK:             %[[VAL_32:.*]] = affine.apply #{{.*}}(%[[VAL_28]]){{\[}}%[[VAL_0]], %[[VAL_2]], %[[VAL_0]]]
-// CHECK:             %[[VAL_33:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_32]]] : memref<?xf32>
+// Last loop promoted — dual reduction:
+// CHECK:           %[[VAL_26:.*]] = fir.convert %[[ARG2]] : (!fir.ref<!fir.array<100xf32>>) -> memref<100xf32>
+// CHECK:           %[[VAL_27:.*]]:2 = affine.for %[[VAL_28:.*]] = 1 to 101 iter_args(%[[VAL_29:.*]] = %[[VAL_30:.*]]#1, %[[VAL_31:.*]] = %[[VAL_1]]) -> (f32, f32) {
+// CHECK:             %[[VAL_32:.*]] = affine.apply #{{.*}}(%[[VAL_28]])
+// CHECK:             %[[VAL_33:.*]] = affine.load %[[VAL_6]]{{\[}}%[[VAL_32]]] : memref<100xf32>
 // CHECK:             %[[VAL_34:.*]] = arith.addf %[[VAL_29]], %[[VAL_33]] fastmath<contract> : f32
-// CHECK:             %[[VAL_35:.*]] = affine.load %[[VAL_26]]{{\[}}%[[VAL_32]]] : memref<?xf32>
+// CHECK:             %[[VAL_35:.*]] = affine.load %[[VAL_26]]{{\[}}%[[VAL_32]]] : memref<100xf32>
 // CHECK:             %[[VAL_36:.*]] = arith.addf %[[VAL_31]], %[[VAL_35]] fastmath<contract> : f32
 // CHECK:             affine.yield %[[VAL_34]], %[[VAL_36]] : f32, f32
 // CHECK:           }
@@ -217,3 +218,99 @@ func.func @loop_with_result(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.re
 // CHECK:           fir.store %[[VAL_39]] to %[[VAL_5]] : !fir.ref<i32>
 // CHECK:           return %[[VAL_37]] : f32
 // CHECK:         }
+
+
+// -----
+
+// Test: Simple matrix multiplication C(j,i) += A(k,i) * B(j,k)
+// Triple-nested loop with all three arrays using 2D indexing.
+// Fortran: do i=1,N; do j=1,N; do k=1,N; C(j,i) = C(j,i) + A(k,i)*B(j,k); end do; end do; end do
+func.func @matmul(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>  // shared 100x100 shape for a, b and c
+  fir.do_loop %i = %c1 to %c100 step %c1 {
+    fir.do_loop %j = %c1 to %c100 step %c1 {
+      fir.do_loop %k = %c1 to %c100 step %c1 {
+        %k32 = fir.convert %k : (index) -> i32  // each IV is narrowed to i32 ...
+        %k64 = fir.convert %k32 : (i32) -> i64  // ... then widened to i64 before use as a subscript
+        %i32 = fir.convert %i : (index) -> i32
+        %i64 = fir.convert %i32 : (i32) -> i64
+        %j32 = fir.convert %j : (index) -> i32
+        %j64 = fir.convert %j32 : (i32) -> i64
+        %a_idx = fir.array_coor %a(%shp) %k64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>  // address of a(k,i)
+        %a_val = fir.load %a_idx : !fir.ref<f32>
+        %b_idx = fir.array_coor %b(%shp) %j64, %k64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>  // address of b(j,k)
+        %b_val = fir.load %b_idx : !fir.ref<f32>
+        %mul = arith.mulf %a_val, %b_val fastmath<contract> : f32
+        %c_idx = fir.array_coor %c(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>  // address of c(j,i); does not depend on %k
+        %c_val = fir.load %c_idx : !fir.ref<f32>  // c(j,i) is re-read on every k iteration here
+        %sum = arith.addf %c_val, %mul fastmath<contract> : f32  // NOTE(review): the expected output below loads c before the k loop ...
+        fir.store %sum to %c_idx : !fir.ref<f32>  // ... but keeps this store inside it; confirm the accumulation over k is preserved
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @matmul(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[C:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// CHECK:       %[[SHP:.*]] = fir.shape %[[C100]], %[[C100]]
+// CHECK:       %[[AM:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       %[[BM:.*]] = fir.convert %[[B]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       %[[CM:.*]] = fir.convert %[[C]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// CHECK:         %[[IDXI:.*]] = affine.apply #{{.*}}(%[[I]])
+// CHECK:         affine.for %[[J:.*]] = 1 to 101 {
+// CHECK:           %[[IDXJ:.*]] = affine.apply #{{.*}}(%[[J]])
+// CHECK:           %[[CVAL:.*]] = affine.load %[[CM]][%[[IDXI]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:           affine.for %[[K:.*]] = 1 to 101 {
+// CHECK:             %[[IDXK:.*]] = affine.apply #{{.*}}(%[[K]])
+// CHECK:             %[[AVAL:.*]] = affine.load %[[AM]][%[[IDXI]], %[[IDXK]]] : memref<100x100xf32>
+// CHECK:             %[[BVAL:.*]] = affine.load %[[BM]][%[[IDXK]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:             %[[MUL:.*]] = arith.mulf %[[AVAL]], %[[BVAL]] fastmath<contract> : f32
+// CHECK:             %[[SUM:.*]] = arith.addf %[[CVAL]], %[[MUL]] fastmath<contract> : f32
+// CHECK:             affine.store %[[SUM]], %[[CM]][%[[IDXI]], %[[IDXJ]]] : memref<100x100xf32>
+// CHECK:           }
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
+
+// -----
+
+// Test: Triangular loop promoted to affine.for with dimension-based upper bound.
+// Inner bound j=1..i uses affine_map<(d0) -> (d0 + 1)>(%outer_iv) — a dimension,
+// not a symbol, because the outer IV is a loop induction variable.
+func.func @triangular_promotion(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>  // shared 100x100 shape for a and b
+  fir.do_loop %i = %c1 to %c100 step %c1 {
+    fir.do_loop %j = %c1 to %i step %c1 {  // upper bound is the outer IV %i
+      %j32 = fir.convert %j : (index) -> i32  // each IV is narrowed to i32 ...
+      %j64 = fir.convert %j32 : (i32) -> i64  // ... then widened to i64 before use as a subscript
+      %i32 = fir.convert %i : (index) -> i32
+      %i64 = fir.convert %i32 : (i32) -> i64
+      %a_idx = fir.array_coor %a(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>  // address of a(j,i)
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %b(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>  // address of b(j,i)
+      fir.store %a_val to %b_idx : !fir.ref<f32>  // b(j,i) = a(j,i)
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @triangular_promotion(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100x100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100x100xf32>>)
+// CHECK-DAG:   %[[AM:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// CHECK-DAG:   %[[BM:.*]] = fir.convert %[[B]] : (!fir.ref<!fir.array<100x100xf32>>) -> memref<100x100xf32>
+// Outer loop: constant bounds (folded by AffineIndexBuilder)
+// CHECK:       affine.for %[[I:.*]] = 1 to 101 {
+// Inner loop: upper bound is affine_map<(d0) -> (d0 + 1)>(%outer_iv) — dimension
+// CHECK:         affine.for %[[J:.*]] = 1 to #{{.*}}(%[[I]]) {
+// CHECK:           affine.load %[[AM]][%{{.*}}, %{{.*}}] : memref<100x100xf32>
+// CHECK:           affine.store %{{.*}}, %[[BM]][%{{.*}}, %{{.*}}] : memref<100x100xf32>
+// CHECK:         }
+// CHECK:       }
+// CHECK:       return
diff --git a/flang/test/Transforms/simplify-do-loop.fir b/flang/test/Transforms/simplify-do-loop.fir
new file mode 100644
index 0000000000000..1cba02b834ade
--- /dev/null
+++ b/flang/test/Transforms/simplify-do-loop.fir
@@ -0,0 +1,322 @@
+// Test simplify-fir-loop pass
+// Canonicalizes fir.do_loop nests by removing shadow iter_args, forwarding
+// IV loads, and emitting final IV value stores after the outermost loop.
+
+// RUN: fir-opt --split-input-file --simplify-fir-loop -cse %s | FileCheck %s
+
+// -----
+
+// Test 1: Simple 1D loop — iter_arg removed, IV loads forwarded, final store
+// emitted after the loop.
+// Fortran: do i = 1, 100; b(i) = a(i); end do
+func.func @simple_1d(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c1 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.load %2 : !fir.ref<i32>
+    %11 = arith.addi %10, %3 overflow<nsw> : i32
+    fir.result %11 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @simple_1d(
+// CHECK-SAME:    %[[A:.*]]: !fir.ref<!fir.array<100xf32>>, %[[B:.*]]: !fir.ref<!fir.array<100xf32>>)
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// CHECK:       %[[SHAPE:.*]] = fir.shape %[[C100]]
+// CHECK:       %[[IV_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i"
+// CHECK:       %[[IV_DECL:.*]] = fir.declare %[[IV_ALLOCA]]
+// iter_args must be removed:
+// CHECK:       fir.do_loop %[[I:.*]] = %[[C1]] to %[[C100]] step %[[C1]] {
+// CHECK-NOT:     iter_args
+// IV loads must be forwarded to fir.convert(IV):
+// CHECK:         %[[I_I32:.*]] = fir.convert %[[I]] : (index) -> i32
+// CHECK:         %[[I_I64:.*]] = fir.convert %[[I_I32]] : (i32) -> i64
+// CHECK:         %[[A_IDX:.*]] = fir.array_coor %[[A]](%[[SHAPE]]) %[[I_I64]]
+// CHECK:         %[[A_VAL:.*]] = fir.load %[[A_IDX]]
+// CHECK:         %[[B_IDX:.*]] = fir.array_coor %[[B]](%[[SHAPE]]) %[[I_I64]]
+// CHECK:         fir.store %[[A_VAL]] to %[[B_IDX]]
+// CHECK:       }
+// Final IV value: lb + max(0, (ub - lb + step) / step) * step
+// CHECK:       %[[SUB:.*]] = arith.subi %[[C100]], %[[C1]] : index
+// CHECK:       %[[ADD:.*]] = arith.addi %[[SUB]], %[[C1]] : index
+// CHECK:       %[[DIV:.*]] = arith.divsi %[[ADD]], %[[C1]] : index
+// CHECK:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK:       %[[CMP:.*]] = arith.cmpi sgt, %[[DIV]], %[[C0]] : index
+// CHECK:       %[[TRIP:.*]] = arith.select %[[CMP]], %[[DIV]], %[[C0]] : index
+// CHECK:       %[[MUL:.*]] = arith.muli %[[TRIP]], %[[C1]] : index
+// CHECK:       %[[FINALIDX:.*]] = arith.addi %[[C1]], %[[MUL]] : index
+// CHECK:       %[[FINAL:.*]] = fir.convert %[[FINALIDX]] : (index) -> i32
+// CHECK:       fir.store %[[FINAL]] to %[[IV_DECL]]
+// CHECK:       return
+
+// -----
+
+// Test 2: Nested 2D loop — both iter_args removed, final stores for i and j.
+// Fortran: do i = 1,100; do j = 1,100; c(j,i) = a(j,i) + b(j,i); end do; end do
+func.func @nested_2d(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+  %ad = fir.declare %a(%shp) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %bd = fir.declare %b(%shp) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %cd = fir.declare %c(%shp) {uniq_name = "_QFEc"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %i_alloca = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %i_decl = fir.declare %i_alloca {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %j_alloca = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+  %j_decl = fir.declare %j_alloca {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %init = fir.convert %c1 : (index) -> i32
+  %outer = fir.do_loop %i = %c1 to %c100 step %c1 iter_args(%i_arg = %init) -> (i32) {
+    fir.store %i_arg to %i_decl : !fir.ref<i32>
+    %inner = fir.do_loop %j = %c1 to %c100 step %c1 iter_args(%j_arg = %init) -> (i32) {
+      fir.store %j_arg to %j_decl : !fir.ref<i32>
+      %jv = fir.load %j_decl : !fir.ref<i32>
+      %j64 = fir.convert %jv : (i32) -> i64
+      %iv = fir.load %i_decl : !fir.ref<i32>
+      %i64 = fir.convert %iv : (i32) -> i64
+      %a_idx = fir.array_coor %ad(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %bd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %b_val = fir.load %b_idx : !fir.ref<f32>
+      %sum = arith.addf %a_val, %b_val fastmath<contract> : f32
+      %c_idx = fir.array_coor %cd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      fir.store %sum to %c_idx : !fir.ref<f32>
+      %jv2 = fir.load %j_decl : !fir.ref<i32>
+      %j_next = arith.addi %jv2, %init overflow<nsw> : i32
+      fir.result %j_next : i32
+    }
+    fir.store %inner to %j_decl : !fir.ref<i32>
+    %iv2 = fir.load %i_decl : !fir.ref<i32>
+    %i_next = arith.addi %iv2, %init overflow<nsw> : i32
+    fir.result %i_next : i32
+  }
+  fir.store %outer to %i_decl : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @nested_2d(
+// Both loops must have no iter_args (step operand is one token, then `{`):
+// CHECK:       fir.do_loop %[[I:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK:         fir.do_loop %[[J:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK-NOT:       iter_args
+// IV loads forwarded — j and i accessed via fir.convert of loop IVs:
+// CHECK:           %[[J_I32:.*]] = fir.convert %[[J]] : (index) -> i32
+// CHECK:           %[[J_I64:.*]] = fir.convert %[[J_I32]] : (i32) -> i64
+// CHECK:           %[[I_I32:.*]] = fir.convert %[[I]] : (index) -> i32
+// CHECK:           %[[I_I64:.*]] = fir.convert %[[I_I32]] : (i32) -> i64
+// CHECK:         }
+// CHECK:       }
+// Final stores for both i and j after the outermost loop:
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 3: Triangular loop — inner bound depends on outer IV.
+// Fortran: do i = 1,100; do j = 1,i; c(j,i) = a(j,i) + b(j,i); end do; end do
+func.func @triangular(%a: !fir.ref<!fir.array<100x100xf32>>, %b: !fir.ref<!fir.array<100x100xf32>>, %c: !fir.ref<!fir.array<100x100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %shp = fir.shape %c100, %c100 : (index, index) -> !fir.shape<2>
+  %ad = fir.declare %a(%shp) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %bd = fir.declare %b(%shp) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %cd = fir.declare %c(%shp) {uniq_name = "_QFEc"} : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>) -> !fir.ref<!fir.array<100x100xf32>>
+  %i_alloca = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %i_decl = fir.declare %i_alloca {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %j_alloca = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+  %j_decl = fir.declare %j_alloca {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %init = fir.convert %c1 : (index) -> i32
+  %outer = fir.do_loop %i = %c1 to %c100 step %c1 iter_args(%i_arg = %init) -> (i32) {
+    fir.store %i_arg to %i_decl : !fir.ref<i32>
+    %i_val = fir.load %i_decl : !fir.ref<i32>
+    %i_idx = fir.convert %i_val : (i32) -> index
+    %inner = fir.do_loop %j = %c1 to %i_idx step %c1 iter_args(%j_arg = %init) -> (i32) {
+      fir.store %j_arg to %j_decl : !fir.ref<i32>
+      %jv = fir.load %j_decl : !fir.ref<i32>
+      %j64 = fir.convert %jv : (i32) -> i64
+      %iv = fir.load %i_decl : !fir.ref<i32>
+      %i64 = fir.convert %iv : (i32) -> i64
+      %a_idx = fir.array_coor %ad(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %a_val = fir.load %a_idx : !fir.ref<f32>
+      %b_idx = fir.array_coor %bd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      %b_val = fir.load %b_idx : !fir.ref<f32>
+      %sum = arith.addf %a_val, %b_val fastmath<contract> : f32
+      %c_idx = fir.array_coor %cd(%shp) %j64, %i64 : (!fir.ref<!fir.array<100x100xf32>>, !fir.shape<2>, i64, i64) -> !fir.ref<f32>
+      fir.store %sum to %c_idx : !fir.ref<f32>
+      %jv2 = fir.load %j_decl : !fir.ref<i32>
+      %j_next = arith.addi %jv2, %init overflow<nsw> : i32
+      fir.result %j_next : i32
+    }
+    fir.store %inner to %j_decl : !fir.ref<i32>
+    %iv2 = fir.load %i_decl : !fir.ref<i32>
+    %i_next = arith.addi %iv2, %init overflow<nsw> : i32
+    fir.result %i_next : i32
+  }
+  fir.store %outer to %i_decl : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @triangular(
+// Both loops transformed — no iter_args (step operand is one token, then `{`):
+// CHECK:       fir.do_loop %[[I:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// Inner loop bound uses the outer IV (through fir.convert chain):
+// CHECK:         fir.do_loop %[[J:.*]] = %{{.*}} to %{{.*}} step %{{[^ ]+}} {
+// CHECK-NOT:       iter_args
+// CHECK:         }
+// CHECK:       }
+// Final stores for both i and j after outermost loop:
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 4: Non-unit step. Fortran: do i = 1, 100, 3; b(i) = a(i); end do
+func.func @non_unit_step(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c3 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.convert %c3 : (index) -> i32
+    %11 = fir.load %2 : !fir.ref<i32>
+    %12 = arith.addi %11, %10 overflow<nsw> : i32
+    fir.result %12 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @non_unit_step(
+// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG:   %[[C100:.*]] = arith.constant 100 : index
+// Loop with step 3 and no iter_args:
+// CHECK:       fir.do_loop %[[I:.*]] = %[[C1]] to %[[C100]] step %[[C3]] {
+// CHECK-NOT:     iter_args
+// CHECK:       }
+// Final IV value: lb + trip_count * step = 1 + 34 * 3 = 103
+// CHECK:       %[[FINAL_I32:.*]] = fir.convert %{{.*}} : (index) -> i32
+// CHECK:       fir.store %[[FINAL_I32]] to %{{.*}} : !fir.ref<i32>
+// CHECK:       return
+
+// -----
+
+// Test 5: Rejection — loop with fir.call must NOT be transformed.
+func.func @rejection_with_call(%arg0: !fir.ref<!fir.array<100xf32>>, %arg1: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.convert %c1 : (index) -> i32
+  %4 = fir.do_loop %arg2 = %c1 to %c100 step %c1 iter_args(%arg3 = %3) -> (i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %5 = fir.load %2 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> i64
+    %7 = fir.array_coor %arg0(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %8 = fir.load %7 : !fir.ref<f32>
+    fir.call @_QPuser_sub(%7) fastmath<contract> : (!fir.ref<f32>) -> ()
+    %9 = fir.array_coor %arg1(%0) %6 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    fir.store %8 to %9 : !fir.ref<f32>
+    %10 = fir.load %2 : !fir.ref<i32>
+    %11 = arith.addi %10, %3 overflow<nsw> : i32
+    fir.result %11 : i32
+  }
+  fir.store %4 to %2 : !fir.ref<i32>
+  return
+}
+func.func private @_QPuser_sub(!fir.ref<f32>)
+
+// CHECK-LABEL: func.func @rejection_with_call(
+// Loop must remain UNCHANGED — iter_args still present:
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (i32) {
+// CHECK:         fir.call @_QPuser_sub
+// CHECK:         fir.result %{{.*}} : i32
+// CHECK:       }
+// CHECK:       return
+
+// -----
+
+// Test 6: Rejection — iter_arg carries a reduction value (sum), not the IV.
+// The pass must NOT transform this loop because the iter_arg is not a shadow
+// of the induction variable.
+// Fortran: sum = 0.0; do i = 1, 100; sum = sum + a(i); end do
+func.func @rejection_reduction(%arg0: !fir.ref<!fir.array<100xf32>>, %sum_ref: !fir.ref<f32>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %cst = arith.constant 0.000000e+00 : f32
+  %1 = fir.do_loop %arg1 = %c1 to %c100 step %c1 iter_args(%sum = %cst) -> (f32) {
+    %2 = fir.convert %arg1 : (index) -> i64
+    %3 = fir.array_coor %arg0(%0) %2 : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, i64) -> !fir.ref<f32>
+    %4 = fir.load %3 : !fir.ref<f32>
+    %5 = arith.addf %sum, %4 fastmath<contract> : f32
+    fir.result %5 : f32
+  }
+  fir.store %1 to %sum_ref : !fir.ref<f32>
+  return
+}
+
+// CHECK-LABEL: func.func @rejection_reduction(
+// Loop must remain UNCHANGED — iter_args still present (reduction, not IV):
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (f32) {
+// CHECK:         arith.addf
+// CHECK:         fir.result %{{.*}} : f32
+// CHECK:       }
+// CHECK:       fir.store
+// CHECK:       return
+
+// -----
+
+// Test 7: Rejection — iter_arg is i32 but init value is NOT fir.convert(lb).
+// The init is an arbitrary constant, not derived from the loop lower bound.
+func.func @rejection_non_iv_init(%arg0: !fir.ref<!fir.array<100xf32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %c42_i32 = arith.constant 42 : i32
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %1 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+  %2 = fir.declare %1 {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.do_loop %arg1 = %c1 to %c100 step %c1 iter_args(%x = %c42_i32) -> (i32) {
+    fir.store %x to %2 : !fir.ref<i32>
+    %4 = fir.load %2 : !fir.ref<i32>
+    %c2_i32 = arith.constant 2 : i32
+    %5 = arith.addi %4, %c2_i32 overflow<nsw> : i32
+    fir.result %5 : i32
+  }
+  fir.store %3 to %2 : !fir.ref<i32>
+  return
+}
+
+// CHECK-LABEL: func.func @rejection_non_iv_init(
+// Loop must remain UNCHANGED — init is 42, not fir.convert(lb):
+// CHECK:       %[[C42:.*]] = arith.constant 42 : i32
+// CHECK:       fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %[[C42]]) -> (i32) {
+// CHECK:         fir.result %{{.*}} : i32
+// CHECK:       }
+// CHECK:       return

>From 47c45dfc490bbe4d5a61c17c1596da3f83640a98 Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Wed, 15 Apr 2026 01:32:21 +0530
Subject: [PATCH 2/4] Address review comments

---
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  9 ++--
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 45 ++++++++++---------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 34209fb89335f..dffce9aa2e9bb 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -210,15 +210,12 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
     pm.addPass(mlir::createCanonicalizerPass(config));
     pm.addPass(mlir::createCSEPass());
 
-    addNestedPassToAllTopLevelOperations<PassConstructor>(
-        pm, fir::createSimplifyDoLoopPass);
-
-    pm.addPass(mlir::createCanonicalizerPass(config));
-    pm.addPass(mlir::createCSEPass());
-
     pm.addPass(mlir::createLoopInvariantCodeMotionPass());
     pm.addPass(fir::createLoopInvariantCodeMotion());
 
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, fir::createSimplifyDoLoopPass);
+
     pm.addPass(mlir::createCanonicalizerPass(config));
     pm.addPass(mlir::createCSEPass());
 
diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
index 12f3c5a9a1f91..f48ec7277e7f6 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -104,9 +104,10 @@ static SmallVector<Value, 2> collectAliases(Value ivRef) {
   return aliases;
 }
 
-/// Collect a perfectly nested chain of fir.do_loop ops starting from `outer`.
-/// A loop is considered perfectly nested if between each nesting level only
-/// IV-related operations (stores, converts) and the inner loop exist.
+/// Collect a singly-nested chain of fir.do_loop ops starting from `outer`.
+/// Each loop body must contain exactly one inner fir.do_loop; other operations
+/// are permitted.  Safety checks (no calls, single IV store, IV doesn't escape)
+/// are enforced later by analyzeNest().
 static SmallVector<fir::DoLoopOp> collectNest(fir::DoLoopOp outer) {
   SmallVector<fir::DoLoopOp> nest;
   fir::DoLoopOp cur = outer;
@@ -259,11 +260,21 @@ static bool noCallsInNest(fir::DoLoopOp outermost) {
 
 static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
   for (auto alias : ivAliases)
-    for (auto *user : alias.getUsers())
-      if (!isa<fir::StoreOp, fir::LoadOp, fir::DeclareOp>(user)) {
+    for (auto *user : alias.getUsers()) {
+      if (auto store = dyn_cast<fir::StoreOp>(user)) {
+        if (store.getMemref() != alias) {
+          LLVM_DEBUG(llvm::dbgs()
+                     << "  [escape] IV used as stored value: " << *user
+                     << "\n");
+          return false;
+        }
+        continue;
+      }
+      if (!isa<fir::LoadOp, fir::DeclareOp>(user)) {
         LLVM_DEBUG(llvm::dbgs() << "  [escape] IV escapes: " << *user << "\n");
         return false;
       }
+    }
   return true;
 }
 
@@ -359,25 +370,20 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
       return it->second;
     return val;
   }
-  if (auto *defOp = val.getDefiningOp()) {
-    if (!outermost->isAncestor(defOp))
-      return val;
-  }
-
   auto *defOp = val.getDefiningOp();
-  if (!defOp)
+  if (!defOp || !outermost->isAncestor(defOp))
     return val;
 
   // fir.convert: rematerialize the input, then re-emit the convert.
   if (auto conv = dyn_cast<fir::ConvertOp>(*defOp)) {
-    auto newInput = rematerializeOutside(conv.getValue(), outermost, builder,
+    Value newInput = rematerializeOutside(conv.getValue(), outermost, builder,
                                          loc, ivFinalMap);
     return fir::ConvertOp::create(builder, loc, conv.getType(), newInput);
   }
 
   // fir.load: the address must already be outside (alloca/declare/etc).
   if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
-    auto addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
+    Value addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
                                      ivFinalMap);
     return fir::LoadOp::create(builder, loc, addr);
   }
@@ -402,7 +408,6 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
     return cloned->getResult(0);
   }
 
-  // For anything else, assume it's already available.
   return val;
 }
 
@@ -426,8 +431,8 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
 /// the Fortran final value (which is one step past the last iteration).
 /// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
 /// with i=100 (last iteration), not i=101 (Fortran final).
-static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
-                             fir::DoLoopOp outermost,
+static void emitFinalIVStore(OpBuilder &builder, Location loc,
+                             LoopIVInfo &info, fir::DoLoopOp outermost,
                              DenseMap<Value, Value> &ivFinalMap) {
   // Rematerialize bounds outside the outermost loop if needed.
   // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV
@@ -577,7 +582,7 @@ static fir::DoLoopOp transformOneLoop(fir::DoLoopOp loop,
 class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
 public:
   void runOnOperation() override {
-    auto func = getOperation();
+    mlir::func::FuncOp func = getOperation();
 
     // Collect all outermost fir.do_loop ops.
     SmallVector<fir::DoLoopOp> outerLoops;
@@ -586,8 +591,8 @@ class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
         outerLoops.push_back(loop);
     });
 
-    for (auto outerLoop : outerLoops) {
-      auto nestLoops = collectNest(outerLoop);
+    for (fir::DoLoopOp outerLoop : outerLoops) {
+      SmallVector<fir::DoLoopOp> nestLoops = collectNest(outerLoop);
       LLVM_DEBUG(llvm::dbgs()
                  << "SimplifyDoLoop: nest depth " << nestLoops.size() << " at "
                  << outerLoop.getLoc() << "\n");
@@ -599,7 +604,7 @@ class SimplifyDoLoop : public fir::impl::SimplifyDoLoopBase<SimplifyDoLoop> {
 
       // ======== Analysis Phase ========
       SmallVector<LoopIVInfo> infos;
-      for (auto loop : nestLoops)
+      for (fir::DoLoopOp loop : nestLoops)
         infos.push_back({loop, {}, {}, {}, {}, {}, {}});
 
       if (!analyzeNest(infos)) {

>From 78d5a7612903236918c48baf88499033b053b30e Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Wed, 15 Apr 2026 17:20:10 +0530
Subject: [PATCH 3/4] Address review comments: add canSafelyRematerialize() to
 reject loop nests whose bounds depend on non-IV loads modified inside the
 loop.

---
 .../Optimizer/Transforms/SimplifyDoLoop.cpp   | 99 +++++++++++++++----
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
index f48ec7277e7f6..6781d4084035e 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyDoLoop.cpp
@@ -34,7 +34,7 @@
 //      a. Remove the initial store (fir.store %iter_arg to %iv_alloca)
 //      b. Forward all loads of IV alloca inside loop body to fir.convert(IV)
 //      todo: the forwarding of load of iv alloca can be done by some other pass
-//      like fir-memref-dataflow-opt pass (if it is available). 
+//      like fir-memref-dataflow-opt pass (if it is available).
 //      c. Strip iter_args and fir.result, rebuild as simple fir.do_loop
 //   2. After the outermost loop, compute and store final IV values
 //      for all loops whose IV is live after the loop (outer to inner order).
@@ -50,6 +50,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
@@ -263,9 +264,8 @@ static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
     for (auto *user : alias.getUsers()) {
       if (auto store = dyn_cast<fir::StoreOp>(user)) {
         if (store.getMemref() != alias) {
-          LLVM_DEBUG(llvm::dbgs()
-                     << "  [escape] IV used as stored value: " << *user
-                     << "\n");
+          LLVM_DEBUG(llvm::dbgs() << "  [escape] IV used as stored value: "
+                                  << *user << "\n");
           return false;
         }
         continue;
@@ -278,6 +278,57 @@ static bool ivDoesNotEscape(ArrayRef<Value> ivAliases) {
   return true;
 }
 
+// ---- Check if a bound value can be safely rematerialized after the loop ---
+// Runs during analysis (pre-transformation) to reject nests whose bounds
+// contain ops that cannot be correctly duplicated after the outermost loop.
+//
+// Safe:  values defined outside the outermost loop, block args of
+//        fir.do_loop ops at or inside it (resolved via ivFinalMap),
+//        fir.convert, arith constants, and single-result pure ops over safe
+//        values.  Loads of IV allocas are safe because transformOneLoop will
+//        forward them to fir.convert(IV) before rematerializeOutside runs.
+// Unsafe: fir.load of a non-IV address inside the loop (the memory may have
+//         been modified before the post-loop insertion point, so a duplicated
+//         load could read a wrong value), and any impure or multi-result op.
+
+static bool canSafelyRematerialize(Value val, fir::DoLoopOp outermost,
+                                   ArrayRef<LoopIVInfo> infos) {
+  if (auto blockArg = dyn_cast<BlockArgument>(val)) {
+    auto *owner = blockArg.getOwner()->getParentOp();
+    if (!outermost->isAncestor(owner))
+      return true;
+    return isa<fir::DoLoopOp>(owner);
+  }
+
+  auto *defOp = val.getDefiningOp();
+  if (!defOp || !outermost->isAncestor(defOp))
+    return true;
+
+  if (auto conv = dyn_cast<fir::ConvertOp>(*defOp))
+    return canSafelyRematerialize(conv.getValue(), outermost, infos);
+
+  if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
+    for (const auto &info : infos)
+      if (llvm::is_contained(info.ivAliases, load.getMemref()))
+        return true;
+    LLVM_DEBUG(llvm::dbgs() << "  [remat] non-IV load in bound: " << *defOp
+                            << "\n");
+    return false;
+  }
+
+  if (isa<arith::ConstantOp>(*defOp))
+    return true;
+
+  if (defOp->getNumResults() == 1 && mlir::isPure(defOp)) {
+    for (Value operand : defOp->getOperands())
+      if (!canSafelyRematerialize(operand, outermost, infos))
+        return false;
+    return true;
+  }
+
+  return false;
+}
+
 // ---- Full nest analysis ---------------------------------------------------
 
 static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
@@ -343,6 +394,19 @@ static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
     }
   }
 
+  // --- Verify that loop bounds can be safely rematerialized after the loop ---
+  fir::DoLoopOp outermost = infos.front().loop;
+  for (auto &info : infos) {
+    if (!canSafelyRematerialize(info.lowerBound, outermost, infos) ||
+        !canSafelyRematerialize(info.upperBound, outermost, infos) ||
+        !canSafelyRematerialize(info.step, outermost, infos)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  bounds not safely rematerializable at "
+                 << info.loop.getLoc() << "\n");
+      return false;
+    }
+  }
+
   return true;
 }
 
@@ -353,7 +417,12 @@ static bool analyzeNest(SmallVector<LoopIVInfo> &infos) {
 /// Ensure a value is available (dominates) at the current insertion point.
 /// If the value is already defined outside `outermost`, return it directly.
 /// Otherwise, rematerialize the computation by cloning through simple ops
-/// (fir.convert, fir.load, arith constants).
+/// (fir.convert, arith constants, and other single-result pure ops).
+///
+/// Precondition: canSafelyRematerialize() has already verified that the
+/// bound values do not depend on non-IV loads inside the loop.  Any IV loads
+/// (fir.load of IV alloca) have been forwarded to fir.convert(IV) by
+/// transformOneLoop before this function is called.
 ///
 /// `ivFinalMap` maps loop induction variables (block arguments) to their
 /// already-computed final index values.  This allows inner loop bounds that
@@ -377,27 +446,19 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
   // fir.convert: rematerialize the input, then re-emit the convert.
   if (auto conv = dyn_cast<fir::ConvertOp>(*defOp)) {
     Value newInput = rematerializeOutside(conv.getValue(), outermost, builder,
-                                         loc, ivFinalMap);
+                                          loc, ivFinalMap);
     return fir::ConvertOp::create(builder, loc, conv.getType(), newInput);
   }
 
-  // fir.load: the address must already be outside (alloca/declare/etc).
-  if (auto load = dyn_cast<fir::LoadOp>(*defOp)) {
-    Value addr = rematerializeOutside(load.getMemref(), outermost, builder, loc,
-                                     ivFinalMap);
-    return fir::LoadOp::create(builder, loc, addr);
-  }
-
   // arith.constant: just clone it.
   if (isa<arith::ConstantOp>(*defOp)) {
     auto *cloned = builder.clone(*defOp);
     return cloned->getResult(0);
   }
 
-  // Arithmetic ops (addi, subi, muli, divsi, cmpi, select): rematerialize
-  // all operands recursively, then clone the op with new operands.
-  if (isa<arith::AddIOp, arith::SubIOp, arith::MulIOp, arith::DivSIOp,
-          arith::CmpIOp, arith::SelectOp>(*defOp)) {
+  // Pure ops (no side effects): rematerialize all operands recursively,
+  // then clone the op with new operands.
+  if (defOp->getNumResults() == 1 && mlir::isPure(defOp)) {
     SmallVector<Value> newOperands;
     for (auto operand : defOp->getOperands())
       newOperands.push_back(
@@ -431,8 +492,8 @@ static Value rematerializeOutside(Value val, fir::DoLoopOp outermost,
 /// the Fortran final value (which is one step past the last iteration).
 /// Example: for `do i=1,100; do j=1,i`, j's final value must be computed
 /// with i=100 (last iteration), not i=101 (Fortran final).
-static void emitFinalIVStore(OpBuilder &builder, Location loc,
-                             LoopIVInfo &info, fir::DoLoopOp outermost,
+static void emitFinalIVStore(OpBuilder &builder, Location loc, LoopIVInfo &info,
+                             fir::DoLoopOp outermost,
                              DenseMap<Value, Value> &ivFinalMap) {
   // Rematerialize bounds outside the outermost loop if needed.
   // For inner loops with IV-dependent bounds (e.g. do j=1,i), the outer IV

>From 332b902bd79e11838d14c17d681acfccc25ebd9d Mon Sep 17 00:00:00 2001
From: Shubham Yadav <shuyadav at amd.com>
Date: Fri, 24 Apr 2026 18:31:49 +0530
Subject: [PATCH 4/4] Add symbolic constant support in AffinePromotion and
 block fir.if promotion

Fix handling of mixed dimension and symbolic constant affine expressions
in loop bound and index analysis. Block fir.if promotion until affine.if
conversion is properly implemented in a follow-up patch.
---
 .../Optimizer/Transforms/AffinePromotion.cpp  | 272 +++++++++++++-----
 flang/test/Fir/affine-promotion.fir           |  35 +--
 2 files changed, 222 insertions(+), 85 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
index da9364e62682d..5bdd6842a0544 100644
--- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
@@ -50,6 +50,10 @@ struct AffineIfAnalysis;
 struct AffineFunctionAnalysis {
   explicit AffineFunctionAnalysis(mlir::func::FuncOp funcOp) {
     funcOp->walk([&](fir::DoLoopOp doloop) {
+      fir::DoLoopOp outermost = doloop;
+      while (auto parent = outermost->getParentOfType<fir::DoLoopOp>())
+        outermost = parent;
+      outermostLoopMap[doloop] = outermost;
       loopAnalysisMap.try_emplace(doloop, doloop, *this);
     });
   }
@@ -58,26 +62,43 @@ struct AffineFunctionAnalysis {
 
   AffineIfAnalysis getChildIfAnalysis(fir::IfOp op) const;
 
+  fir::DoLoopOp getOutermostLoop(fir::DoLoopOp op) const {
+    auto it = outermostLoopMap.find(op.getOperation());
+    assert(it != outermostLoopMap.end());
+    return it->second;
+  }
+
   llvm::DenseMap<mlir::Operation *, AffineLoopAnalysis> loopAnalysisMap;
   llvm::DenseMap<mlir::Operation *, AffineIfAnalysis> ifAnalysisMap;
+  llvm::DenseMap<mlir::Operation *, fir::DoLoopOp> outermostLoopMap;
 };
 } // namespace
 
 /// Recursively checks whether a value can be expressed as an affine function
-/// of loop induction variables and integer constants.  Walks through
-/// fir.convert (type-cast), arith.addi, arith.subi, and arith.muli (the
-/// latter only when at least one operand is a compile-time constant so the
-/// result stays within MLIR's strict affine expression rules).
-static bool isAffineIndex(mlir::Value val, unsigned depth = 0) {
+/// of loop induction variables, integer constants, and loop-invariant symbols
+/// (values defined outside the outermost loop of the nest).
+///
+/// When \p outermost is provided, values defined outside it are accepted as
+/// valid affine symbols.  When nullptr, only loop IVs and constants are
+/// accepted (legacy behavior).
+static bool isAffineIndex(mlir::Value val,
+                          fir::DoLoopOp outermost = nullptr,
+                          unsigned depth = 0) {
   if (depth > 16)
     return false;
 
   if (auto conv = val.getDefiningOp<fir::ConvertOp>())
-    return isAffineIndex(conv.getValue(), depth + 1);
+    return isAffineIndex(conv.getValue(), outermost, depth + 1);
 
-  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val))
-    return isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
-           isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp());
+  if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val)) {
+    if (isa<fir::DoLoopOp>(blockArg.getOwner()->getParentOp()) ||
+        isa<mlir::affine::AffineForOp>(blockArg.getOwner()->getParentOp()))
+      return true;
+    if (outermost &&
+        !outermost->isAncestor(blockArg.getOwner()->getParentOp()))
+      return true;
+    return false;
+  }
 
   auto *defOp = val.getDefiningOp();
   if (!defOp)
@@ -87,23 +108,27 @@ static bool isAffineIndex(mlir::Value val, unsigned depth = 0) {
     return true;
 
   if (auto add = dyn_cast<mlir::arith::AddIOp>(defOp))
-    return isAffineIndex(add.getLhs(), depth + 1) &&
-           isAffineIndex(add.getRhs(), depth + 1);
+    return isAffineIndex(add.getLhs(), outermost, depth + 1) &&
+           isAffineIndex(add.getRhs(), outermost, depth + 1);
 
   if (auto sub = dyn_cast<mlir::arith::SubIOp>(defOp))
-    return isAffineIndex(sub.getLhs(), depth + 1) &&
-           isAffineIndex(sub.getRhs(), depth + 1);
+    return isAffineIndex(sub.getLhs(), outermost, depth + 1) &&
+           isAffineIndex(sub.getRhs(), outermost, depth + 1);
 
   if (auto mul = dyn_cast<mlir::arith::MulIOp>(defOp)) {
     auto *lhsDef = mul.getLhs().getDefiningOp();
     auto *rhsDef = mul.getRhs().getDefiningOp();
     if ((lhsDef && isa<mlir::arith::ConstantOp>(lhsDef)) ||
         (rhsDef && isa<mlir::arith::ConstantOp>(rhsDef)))
-      return isAffineIndex(mul.getLhs(), depth + 1) &&
-             isAffineIndex(mul.getRhs(), depth + 1);
+      return isAffineIndex(mul.getLhs(), outermost, depth + 1) &&
+             isAffineIndex(mul.getRhs(), outermost, depth + 1);
     return false;
   }
 
+  // Value defined outside the outermost loop → valid affine symbol.
+  if (outermost && !outermost->isAncestor(defOp))
+    return true;
+
   LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: index is not an affine "
                              "expression of loop IVs\n";
              defOp->dump());
@@ -113,10 +138,14 @@ static bool isAffineIndex(mlir::Value val, unsigned depth = 0) {
 /// Builds an mlir::AffineExpr by recursively walking the FIR/arith expression
 /// tree rooted at a fir.array_coor index value.  Loop induction variables
 /// become affine dimensions; integer constants are folded into the expression.
+/// Values defined outside the outermost enclosing loop are classified as
+/// affine symbols — they are loop-invariant across the entire nest.
 struct AffineIndexBuilder {
   using MaybeExpr = std::optional<mlir::AffineExpr>;
 
-  explicit AffineIndexBuilder(mlir::MLIRContext *ctx) : context(ctx) {}
+  explicit AffineIndexBuilder(mlir::MLIRContext *ctx,
+                              fir::DoLoopOp outermost = nullptr)
+      : context(ctx), outermostLoop(outermost) {}
 
   MaybeExpr build(mlir::Value val) {
     if (auto conv = val.getDefiningOp<fir::ConvertOp>())
@@ -132,6 +161,9 @@ struct AffineIndexBuilder {
         dims.push_back(val);
         return mlir::getAffineDimExpr(idx, context);
       }
+      if (outermostLoop &&
+          !outermostLoop->isAncestor(blockArg.getOwner()->getParentOp()))
+        return addSymbol(val);
       return {};
     }
 
@@ -167,13 +199,44 @@ struct AffineIndexBuilder {
       return {};
     }
 
+    // Value defined outside the outermost loop → affine symbol.
+    if (outermostLoop && !outermostLoop->isAncestor(defOp))
+      return addSymbol(val);
+
     return {};
   }
 
   mlir::MLIRContext *context;
+  fir::DoLoopOp outermostLoop;
   llvm::SmallVector<mlir::Value> dims;
+  llvm::SmallVector<mlir::Value> syms;
+
+private:
+  MaybeExpr addSymbol(mlir::Value val) {
+    for (unsigned i = 0; i < syms.size(); ++i)
+      if (syms[i] == val)
+        return mlir::getAffineSymbolExpr(i, context);
+    unsigned idx = syms.size();
+    syms.push_back(val);
+    return mlir::getAffineSymbolExpr(idx, context);
+  }
 };
 
+/// Ensure a value is index-typed, inserting a fir.convert right after its
+/// definition (or at the start of the owning block for a block argument) if
+/// needed.  Affine operations require index-typed dimension/symbol operands.
+static mlir::Value castToIndex(mlir::Value val, mlir::PatternRewriter &rewriter) {
+  if (val.getType().isIndex())
+    return val;
+  mlir::OpBuilder::InsertionGuard guard(rewriter);
+  if (auto *defOp = val.getDefiningOp())
+    rewriter.setInsertionPointAfter(defOp);
+  else if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(val))
+    rewriter.setInsertionPointToStart(blockArg.getOwner());
+  return fir::ConvertOp::create(rewriter, val.getLoc(),
+                                rewriter.getIndexType(), val);
+}
+
 namespace {
 struct AffineLoopAnalysis {
   AffineLoopAnalysis() = default;
@@ -193,8 +256,14 @@ struct AffineLoopAnalysis {
       if (!analysis.canPromoteToAffine())
         return false;
     }
-    for (auto ifOp : loopOperation.getOps<fir::IfOp>())
-      functionAnalysis.ifAnalysisMap.try_emplace(ifOp, ifOp, functionAnalysis);
+    // Reject loops containing fir.if until full fir.if → affine.if
+    // promotion is available.
+    if (!loopOperation.getOps<fir::IfOp>().empty()) {
+      LLVM_DEBUG(llvm::dbgs()
+                     << "AffineLoopAnalysis: loop contains fir.if, "
+                        "skipping (if-promotion not yet enabled)\n");
+      return false;
+    }
     return true;
   }
 
@@ -213,12 +282,15 @@ struct AffineLoopAnalysis {
   bool analyzeLoop(fir::DoLoopOp loopOperation,
                    AffineFunctionAnalysis &functionAnalysis) {
     LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: \n"; loopOperation.dump(););
-    return analyzeMemoryAccess(loopOperation) &&
+    auto outermost = functionAnalysis.getOutermostLoop(loopOperation);
+    return analyzeBounds(loopOperation, outermost) &&
+           analyzeMemoryAccess(loopOperation, outermost) &&
            analysisResults(loopOperation) &&
            analyzeBody(loopOperation, functionAnalysis);
   }
 
-  bool analyzeReference(mlir::Value memref, mlir::Operation *op) {
+  bool analyzeReference(mlir::Value memref, mlir::Operation *op,
+                        fir::DoLoopOp outermost) {
     if (auto acoOp = memref.getDefiningOp<ArrayCoorOp>()) {
       if (mlir::isa<fir::BoxType>(acoOp.getMemref().getType())) {
         // TODO: Look if and how fir.box can be promoted to affine.
@@ -229,7 +301,7 @@ struct AffineLoopAnalysis {
       }
       bool canPromote = true;
       for (auto coordinate : acoOp.getIndices())
-        canPromote = canPromote && isAffineIndex(coordinate);
+        canPromote = canPromote && isAffineIndex(coordinate, outermost);
       return canPromote;
     }
     if (auto coOp = memref.getDefiningOp<CoordinateOp>()) {
@@ -246,16 +318,47 @@ struct AffineLoopAnalysis {
     return false;
   }
 
-  bool analyzeMemoryAccess(fir::DoLoopOp loopOperation) {
+  bool analyzeMemoryAccess(fir::DoLoopOp loopOperation,
+                           fir::DoLoopOp outermost) {
     for (auto loadOp : loopOperation.getOps<fir::LoadOp>())
-      if (!analyzeReference(loadOp.getMemref(), loadOp))
+      if (!analyzeReference(loadOp.getMemref(), loadOp, outermost))
         return false;
     for (auto storeOp : loopOperation.getOps<fir::StoreOp>())
-      if (!analyzeReference(storeOp.getMemref(), storeOp))
+      if (!analyzeReference(storeOp.getMemref(), storeOp, outermost))
         return false;
     return true;
   }
 
+  bool analyzeBounds(fir::DoLoopOp loopOperation,
+                     fir::DoLoopOp outermost) {
+    // Only promote loops with a positive constant step. The genericBounds
+    // fallback (which attempts to handle variable/negative steps) is broken
+    // — see the comment on that function — so we reject everything that
+    // positiveConstantStep cannot handle.
+    bool hasPositiveConstantStep = false;
+    if (auto defOp = loopOperation.getStep()
+                          .getDefiningOp<mlir::arith::ConstantOp>())
+      if (auto attr = mlir::dyn_cast<IntegerAttr>(defOp.getValue()))
+        hasPositiveConstantStep = attr.getInt() > 0;
+    if (!hasPositiveConstantStep) {
+      LLVM_DEBUG(llvm::dbgs()
+                     << "AffineLoopAnalysis: step is not a positive "
+                        "constant, cannot promote\n");
+      return false;
+    }
+    if (!isAffineIndex(loopOperation.getLowerBound(), outermost)) {
+      LLVM_DEBUG(llvm::dbgs()
+                     << "AffineLoopAnalysis: lower bound not affine\n");
+      return false;
+    }
+    if (!isAffineIndex(loopOperation.getUpperBound(), outermost)) {
+      LLVM_DEBUG(llvm::dbgs()
+                     << "AffineLoopAnalysis: upper bound not affine\n");
+      return false;
+    }
+    return true;
+  }
+
   bool legality{};
 };
 } // namespace
@@ -411,7 +514,7 @@ AffineFunctionAnalysis::getChildIfAnalysis(fir::IfOp op) const {
   if (it == ifAnalysisMap.end()) {
     LLVM_DEBUG(llvm::dbgs() << "AffineFunctionAnalysis: not computed for:\n";
                op.dump(););
-    op.emitError("error in fetching if analysis in AffineFunctionAnalysis\n");
+
     return {};
   }
   return it->getSecond();
@@ -436,7 +539,8 @@ struct MultiDimAffineResult {
 /// MemRefType, and adjusts each per-dimension index from Fortran 1-based to
 /// memref 0-based indexing.
 static MultiDimAffineResult
-createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
+createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter,
+                        fir::DoLoopOp outermost) {
   auto acoOp = arrayRef.getDefiningOp<ArrayCoorOp>();
   auto loc = acoOp.getLoc();
   auto *context = acoOp.getContext();
@@ -461,28 +565,41 @@ createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
   SmallVector<mlir::Value> adjustedIndices;
   auto indices = acoOp.getIndices();
 
+  auto buildOperands = [&](AffineIndexBuilder &builder) {
+    SmallVector<mlir::Value> operands;
+    for (auto &d : builder.dims)
+      operands.push_back(castToIndex(d, rewriter));
+    for (auto &s : builder.syms)
+      operands.push_back(castToIndex(s, rewriter));
+    return operands;
+  };
+
   if (auto shapeOp = acoOp.getShape().getDefiningOp<ShapeOp>()) {
     for (auto idx : indices) {
-      AffineIndexBuilder builder(context);
+      AffineIndexBuilder builder(context, outermost);
       auto expr = builder.build(idx);
       assert(expr && "analysis guaranteed index is affine");
       auto adjustedExpr = *expr - 1;
-      auto map = mlir::AffineMap::get(builder.dims.size(), 0, adjustedExpr);
+      auto map = mlir::AffineMap::get(builder.dims.size(),
+                                      builder.syms.size(), adjustedExpr);
+      auto operands = buildOperands(builder);
       auto adjusted =
-          affine::AffineApplyOp::create(rewriter, loc, map, builder.dims);
+          affine::AffineApplyOp::create(rewriter, loc, map, operands);
       adjustedIndices.push_back(adjusted.getResult());
     }
   } else if (auto shapeShiftOp =
                  acoOp.getShape().getDefiningOp<ShapeShiftOp>()) {
     auto pairs = shapeShiftOp.getPairs();
     for (unsigned i = 0; i < indices.size(); ++i) {
-      AffineIndexBuilder builder(context);
+      AffineIndexBuilder builder(context, outermost);
       auto expr = builder.build(indices[i]);
       assert(expr && "analysis guaranteed index is affine");
-      auto adjustedExpr = *expr - mlir::getAffineSymbolExpr(0, context);
-      auto map = mlir::AffineMap::get(builder.dims.size(), 1, adjustedExpr);
-      SmallVector<mlir::Value> operands;
-      operands.append(builder.dims.begin(), builder.dims.end());
+      unsigned extraSymIdx = builder.syms.size();
+      auto adjustedExpr =
+          *expr - mlir::getAffineSymbolExpr(extraSymIdx, context);
+      auto map = mlir::AffineMap::get(builder.dims.size(),
+                                      builder.syms.size() + 1, adjustedExpr);
+      auto operands = buildOperands(builder);
       operands.push_back(pairs[i * 2]);
       auto adjusted =
           affine::AffineApplyOp::create(rewriter, loc, map, operands);
@@ -491,15 +608,16 @@ createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
   } else if (auto sliceOp = acoOp.getShape().getDefiningOp<SliceOp>()) {
     auto triples = sliceOp.getTriples();
     for (unsigned i = 0; i < indices.size(); ++i) {
-      AffineIndexBuilder builder(context);
+      AffineIndexBuilder builder(context, outermost);
       auto expr = builder.build(indices[i]);
       assert(expr && "analysis guaranteed index is affine");
-      auto lbSym = mlir::getAffineSymbolExpr(0, context);
-      auto strideSym = mlir::getAffineSymbolExpr(1, context);
+      unsigned extraSymBase = builder.syms.size();
+      auto lbSym = mlir::getAffineSymbolExpr(extraSymBase, context);
+      auto strideSym = mlir::getAffineSymbolExpr(extraSymBase + 1, context);
       auto adjustedExpr = (*expr - lbSym).floorDiv(strideSym);
-      auto map = mlir::AffineMap::get(builder.dims.size(), 2, adjustedExpr);
-      SmallVector<mlir::Value> operands;
-      operands.append(builder.dims.begin(), builder.dims.end());
+      auto map = mlir::AffineMap::get(builder.dims.size(),
+                                      builder.syms.size() + 2, adjustedExpr);
+      auto operands = buildOperands(builder);
       operands.push_back(triples[i * 3]);
       operands.push_back(triples[i * 3 + 2]);
       auto adjusted =
@@ -515,28 +633,31 @@ createMultiDimAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) {
   return {std::move(adjustedIndices), arrayConvert};
 }
 
-static void rewriteLoad(fir::LoadOp loadOp, mlir::PatternRewriter &rewriter) {
+static void rewriteLoad(fir::LoadOp loadOp, mlir::PatternRewriter &rewriter,
+                        fir::DoLoopOp outermost) {
   rewriter.setInsertionPoint(loadOp);
-  auto result = createMultiDimAffineOps(loadOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(loadOp.getMemref(), rewriter, outermost);
   rewriter.replaceOpWithNewOp<affine::AffineLoadOp>(
       loadOp, result.arrayConvert.getResult(), result.indices);
 }
 
 static void rewriteStore(fir::StoreOp storeOp,
-                         mlir::PatternRewriter &rewriter) {
+                         mlir::PatternRewriter &rewriter,
+                         fir::DoLoopOp outermost) {
   rewriter.setInsertionPoint(storeOp);
-  auto result = createMultiDimAffineOps(storeOp.getMemref(), rewriter);
+  auto result = createMultiDimAffineOps(storeOp.getMemref(), rewriter, outermost);
   rewriter.replaceOpWithNewOp<affine::AffineStoreOp>(
       storeOp, storeOp.getValue(), result.arrayConvert.getResult(),
       result.indices);
 }
 
-static void rewriteMemoryOps(Block *block, mlir::PatternRewriter &rewriter) {
+static void rewriteMemoryOps(Block *block, mlir::PatternRewriter &rewriter,
+                              fir::DoLoopOp outermost = {}) {
   for (auto &bodyOp : llvm::make_early_inc_range(block->getOperations())) {
     if (isa<fir::LoadOp>(bodyOp))
-      rewriteLoad(cast<fir::LoadOp>(bodyOp), rewriter);
+      rewriteLoad(cast<fir::LoadOp>(bodyOp), rewriter, outermost);
     else if (isa<fir::StoreOp>(bodyOp))
-      rewriteStore(cast<fir::StoreOp>(bodyOp), rewriter);
+      rewriteStore(cast<fir::StoreOp>(bodyOp), rewriter, outermost);
   }
 }
 
@@ -601,7 +722,8 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
     loop.getInductionVar().replaceAllUsesWith(inductionVar);
     rewriter.finalizeOpModification(loop.getOperation());
 
-    rewriteMemoryOps(affineFor.getBody(), rewriter);
+    auto outermost = functionAnalysis.getOutermostLoop(loop);
+    rewriteMemoryOps(affineFor.getBody(), rewriter, outermost);
 
     LLVM_DEBUG(llvm::dbgs() << "AffineLoopConversion: loop rewriten to:\n";
                affineFor.dump(););
@@ -619,42 +741,56 @@ class AffineLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
     return genericBounds(op, rewriter);
   }
 
-  /// Build an AffineMap + operands for a single loop bound using
-  /// AffineIndexBuilder.  Reuses the same recursive decomposition used for
-  /// array indices: fir.convert, arith.addi/subi/muli, constants, and
-  /// enclosing loop IVs are all handled uniformly.
-  ///
-  /// If the bound is an affine expression of enclosing loop IVs and
-  /// constants, those IVs become dimensions in the map (as required by the
-  /// affine verifier).  Otherwise the raw value is treated as a symbol.
-  static mlir::AffineMap boundMap(mlir::Value operand, int64_t offset,
-                                  mlir::MLIRContext *ctx,
-                                  SmallVectorImpl<mlir::Value> &mapOperands) {
-    AffineIndexBuilder builder(ctx);
+  mlir::AffineMap boundMap(mlir::Value operand, int64_t offset,
+                           mlir::MLIRContext *ctx,
+                           SmallVectorImpl<mlir::Value> &mapOperands,
+                           fir::DoLoopOp outermost,
+                           mlir::PatternRewriter &rewriter) const {
+    AffineIndexBuilder builder(ctx, outermost);
     if (auto expr = builder.build(operand)) {
-      mapOperands.append(builder.dims.begin(), builder.dims.end());
-      return mlir::AffineMap::get(builder.dims.size(), /*symbolCount=*/0,
+      for (auto &d : builder.dims)
+        mapOperands.push_back(castToIndex(d, rewriter));
+      for (auto &s : builder.syms)
+        mapOperands.push_back(castToIndex(s, rewriter));
+      return mlir::AffineMap::get(builder.dims.size(), builder.syms.size(),
                                   *expr + offset);
     }
-    mapOperands.push_back(operand);
-    return mlir::AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1,
-                                mlir::getAffineSymbolExpr(0, ctx) + offset);
+    llvm_unreachable("analysis should have rejected non-affine bounds");
   }
 
-  // when step for the loop is positive compile time constant
   std::pair<affine::AffineForOp, mlir::Value>
   positiveConstantStep(fir::DoLoopOp op, int64_t step,
                        mlir::PatternRewriter &rewriter) const {
     auto *ctx = op.getContext();
+    auto outermost = functionAnalysis.getOutermostLoop(op);
     SmallVector<mlir::Value> lbOperands, ubOperands;
-    auto lbMap = boundMap(op.getLowerBound(), 0, ctx, lbOperands);
-    auto ubMap = boundMap(op.getUpperBound(), 1, ctx, ubOperands);
+    auto lbMap =
+        boundMap(op.getLowerBound(), 0, ctx, lbOperands, outermost, rewriter);
+    auto ubMap =
+        boundMap(op.getUpperBound(), 1, ctx, ubOperands, outermost, rewriter);
     auto affineFor = affine::AffineForOp::create(
         rewriter, op.getLoc(), lbOperands, lbMap, ubOperands, ubMap, step,
         op.getIterOperands());
     return std::make_pair(affineFor, affineFor.getInductionVar());
   }
 
+  // KNOWN FLAWED: This function attempts to normalize any fir.do_loop
+  // (variable step, negative step) into a 0-based step-1 affine.for by
+  // computing trip_count = (ub - lb + step) / step and reconstructing
+  // the original index as actual_i = lb + ii * step.
+  //
+  // Flaws:
+  //   1. Operand classification: lb, ub, step are passed as affine symbols,
+  //      but they may be dimensions (e.g. enclosing loop IVs), causing the
+  //      MLIR verifier to reject with "dimensional operand cannot be used
+  //      as a symbol".
+  //   2. The index reconstruction (lb + ii * step) involves multiplying a
+  //      dimension (ii) by a variable (step), which is not a valid affine
+  //      expression when step is not a compile-time constant.
+  //
+  // Currently unreachable: analyzeBounds rejects loops with non-positive-
+  // constant steps, so createAffineFor always takes the
+  // positiveConstantStep path. This function is kept for reference only.
   std::pair<affine::AffineForOp, mlir::Value>
   genericBounds(fir::DoLoopOp op, mlir::PatternRewriter &rewriter) const {
     auto lowerBound = mlir::getAffineSymbolExpr(0, op.getContext());
diff --git a/flang/test/Fir/affine-promotion.fir b/flang/test/Fir/affine-promotion.fir
index d48d66cbd8a9f..1777044913b16 100644
--- a/flang/test/Fir/affine-promotion.fir
+++ b/flang/test/Fir/affine-promotion.fir
@@ -106,24 +106,25 @@ func.func @loop_with_if(%a: !arr_d1, %v: f32) {
   }
   return
 }
+// Loops containing fir.if are not promoted (if-promotion deferred to a later patch).
+// The entire nest stays as fir.do_loop.
 // CHECK: func @loop_with_if(%[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>>, %[[VAL_1:.*]]: f32) {
-// CHECK:   %[[VAL_4:.*]] = arith.constant 2 : index
-// CHECK:   %[[VAL_5:.*]] = arith.constant 100 : index
-// CHECK:   %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-// CHECK:   %[[VAL_7:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<?xf32>>) -> memref<?xf32>
-// CHECK:   affine.for %[[VAL_8:.*]] = 1 to 101 {
-// CHECK:     %[[VAL_9:.*]] = affine.apply #{{.*}}(%[[VAL_8]])
-// CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_9]]] : memref<?xf32>
-// CHECK:   }
-// CHECK:   affine.for %[[VAL_10:.*]] = 1 to 101 {
-// CHECK:     %[[VAL_11:.*]] = affine.apply #{{.*}}(%[[VAL_10]])
-// CHECK:     affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_11]]] : memref<?xf32>
-// CHECK:   }
-// CHECK:   affine.for %[[VAL_12:.*]] = 1 to 101 {
-// CHECK:     %[[VAL_13:.*]] = arith.subi %[[VAL_12]], %[[VAL_4]] : index
-// CHECK:     affine.if #set(%[[VAL_12]]) {
-// CHECK:       %[[VAL_14:.*]] = affine.apply #{{.*}}(%[[VAL_12]])
-// CHECK:       affine.store %[[VAL_1]], %[[VAL_7]]{{\[}}%[[VAL_14]]] : memref<?xf32>
+// CHECK:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK:   %[[C100:.*]] = arith.constant 100 : index
+// CHECK:   %[[DIMS:.*]] = fir.shape %[[C100]] : (index) -> !fir.shape<1>
+// CHECK:   fir.do_loop %[[I:.*]] = %[[C1]] to %[[C100]] step %[[C1]] {
+// CHECK:     fir.do_loop %[[J:.*]] = %[[C1]] to %[[C100]] step %[[C1]] {
+// CHECK:       fir.do_loop %[[K:.*]] = %[[C1]] to %[[C100]] step %[[C1]] {
+// CHECK:         %[[IM2:.*]] = arith.subi %[[I]], %[[C2]] : index
+// CHECK:         %[[COND:.*]] = arith.cmpi sgt, %[[IM2]], %[[C0]] : index
+// CHECK:         fir.if %[[COND]] {
+// CHECK:           fir.store %[[VAL_1]] to %{{.*}} : !fir.ref<f32>
+// CHECK:         }
+// CHECK:         fir.store %[[VAL_1]] to %{{.*}} : !fir.ref<f32>
+// CHECK:         fir.store %[[VAL_1]] to %{{.*}} : !fir.ref<f32>
+// CHECK:       }
 // CHECK:     }
 // CHECK:   }
 // CHECK:   return