[Mlir-commits] [mlir] [mlir] Add loop bounds normalization pass (PR #93781)

Thu May 30 13:58:25 PDT 2024

https://github.com/jtuyls updated https://github.com/llvm/llvm-project/pull/93781

>From ada2733504215145552d34a88ce485b1e9157920 Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jorn.tuyls at gmail.com>
Date: Thu, 30 May 2024 00:45:10 -0700
Subject: [PATCH] [mlir] Add loop bound normalization pass

---
 mlir/include/mlir/Dialect/SCF/IR/SCFOps.td    |  97 +++++++
 mlir/include/mlir/Dialect/Utils/LoopUtils.h   |  30 ++
 .../mlir/Interfaces/LoopLikeInterface.h       |   4 +
 .../mlir/Interfaces/LoopLikeInterface.td      | 126 +++++++++
 mlir/include/mlir/Transforms/Passes.h         |   4 +
 mlir/include/mlir/Transforms/Passes.td        |   6 +
 mlir/lib/Dialect/SCF/IR/SCF.cpp               |  60 ++++
 mlir/lib/Dialect/SCF/Utils/Utils.cpp          |  55 +---
 mlir/lib/Dialect/Utils/CMakeLists.txt         |   1 +
 mlir/lib/Dialect/Utils/LoopUtils.cpp          |  52 ++++
 mlir/lib/IR/Operation.cpp                     |   2 -
 mlir/lib/Interfaces/LoopLikeInterface.cpp     |  25 ++
 mlir/lib/Transforms/CMakeLists.txt            |   2 +
 mlir/lib/Transforms/NormalizeLoopBounds.cpp   | 118 ++++++++
 mlir/test/Dialect/Affine/loop-coalescing.mlir |  11 +-
 .../Transforms/normalize-loop-bounds.mlir     | 266 ++++++++++++++++++
 16 files changed, 796 insertions(+), 63 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/Utils/LoopUtils.h
 create mode 100644 mlir/lib/Dialect/Utils/LoopUtils.cpp
 create mode 100644 mlir/lib/Transforms/NormalizeLoopBounds.cpp
 create mode 100644 mlir/test/Transforms/normalize-loop-bounds.mlir

diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 0b063aa772bab..0e23257456223 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -140,6 +140,7 @@ def ForOp : SCF_Op<"for",
         "getSingleUpperBound", "getYieldedValuesMutable",
         "promoteIfSingleIteration", "replaceWithAdditionalYields",
         "yieldTiledValuesAndReplace"]>,
+       LoopLikeWithInductionVarsOpInterface,
        AllTypesMatch<["lowerBound", "upperBound", "step"]>,
        ConditionallySpeculatable,
        DeclareOpInterfaceMethods<RegionBranchOpInterface,
@@ -267,6 +268,74 @@ def ForOp : SCF_Op<"for",
       return getBody()->getArguments().drop_front(getNumInductionVars())[index];
     }
 
+    /// Return the induction variables.
+    ::mlir::ValueRange getInductionVars() {
+      return getBody()->getArguments().take_front(getNumInductionVars());
+    }
+
+    /// Get lower bounds as `OpFoldResult`.
+    SmallVector<OpFoldResult> getMixedLowerBound() {
+      return {getAsOpFoldResult(getLowerBound())};
+    }
+
+    /// Get upper bounds as `OpFoldResult`.
+    SmallVector<OpFoldResult> getMixedUpperBound() {
+      return {getAsOpFoldResult(getUpperBound())};
+    }
+
+    /// Get steps as `OpFoldResult`.
+    SmallVector<OpFoldResult> getMixedStep() {
+      return {getAsOpFoldResult(getStep())};
+    }
+
+    /// Get lower bounds as values.
+    SmallVector<Value> getLowerBound(OpBuilder &b) {
+      return ValueRange{getLowerBound()};
+    }
+
+    /// Get upper bounds as values.
+    SmallVector<Value> getUpperBound(OpBuilder &b) {
+      return ValueRange{getUpperBound()};
+    }
+
+    /// Get steps as values.
+    SmallVector<Value> getStep(OpBuilder &b) {
+      return ValueRange{getStep()};
+    }
+
+    /// Set the lower bounds from `OpFoldResult`.
+    void setMixedLowerBounds(OpBuilder &b, ArrayRef<OpFoldResult> lbs) {
+      setLowerBound(getValueOrCreateConstantIndexOp(b, getLoc(), lbs[0]));
+    }
+
+    /// Set the upper bounds from `OpFoldResult`.
+    void setMixedUpperBounds(OpBuilder &b, ArrayRef<OpFoldResult> ubs) {
+      setUpperBound(getValueOrCreateConstantIndexOp(b, getLoc(), ubs[0]));
+    }
+
+    /// Set the steps from `OpFoldResult`.
+    void setMixedSteps(OpBuilder &b, ArrayRef<OpFoldResult> steps) {
+      setStep(getValueOrCreateConstantIndexOp(b, getLoc(), steps[0]));
+    }
+
+    /// Set the lower bounds from values.
+    void setLowerBounds(ArrayRef<Value> lbs) {
+      assert(lbs.size() == 1 && "expected a single lower bound");
+      setLowerBound(lbs[0]);
+    }
+
+    /// Set the upper bounds from values.
+    void setUpperBounds(ArrayRef<Value> ubs) {
+      assert(ubs.size() == 1 && "expected a single upper bound");
+      setUpperBound(ubs[0]);
+    }
+
+    /// Set the steps from values.
+    void setSteps(ArrayRef<Value> steps) {
+      assert(steps.size() == 1 && "expected a single step");
+      setStep(steps[0]);
+    }
+
     void setLowerBound(Value bound) { getOperation()->setOperand(0, bound); }
     void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); }
     void setStep(Value step) { getOperation()->setOperand(2, step); }
@@ -304,6 +373,7 @@ def ForallOp : SCF_Op<"forall", [
           ["getInitsMutable", "getRegionIterArgs", "getSingleInductionVar", 
            "getSingleLowerBound", "getSingleUpperBound", "getSingleStep",
            "promoteIfSingleIteration", "yieldTiledValuesAndReplace"]>,
+       LoopLikeWithInductionVarsOpInterface,
        RecursiveMemoryEffects,
        SingleBlockImplicitTerminator<"scf::InParallelOp">,
        DeclareOpInterfaceMethods<RegionBranchOpInterface>,
@@ -543,6 +613,33 @@ def ForallOp : SCF_Op<"forall", [
       return getValueOrCreateConstantIndexOp(b, getLoc(), getMixedStep());
     }
 
+    /// Set the lower bounds from `OpFoldResult`.
+    void setMixedLowerBounds(OpBuilder &b, ArrayRef<OpFoldResult> lbs);
+
+    /// Set the upper bounds from `OpFoldResult`.
+    void setMixedUpperBounds(OpBuilder &b, ArrayRef<OpFoldResult> ubs);
+
+    /// Set the steps from `OpFoldResult`.
+    void setMixedSteps(OpBuilder &b, ArrayRef<OpFoldResult> steps);
+
+    /// Set the lower bounds from values.
+    void setLowerBounds(ArrayRef<Value> lbs) {
+      OpBuilder b(getOperation()->getContext());
+      return setMixedLowerBounds(b, getAsOpFoldResult(lbs));
+    }
+
+    /// Set the upper bounds from values.
+    void setUpperBounds(ArrayRef<Value> ubs) {
+      OpBuilder b(getOperation()->getContext());
+      return setMixedUpperBounds(b, getAsOpFoldResult(ubs));
+    }
+
+    /// Set the steps from values.
+    void setSteps(ArrayRef<Value> steps) {
+      OpBuilder b(getOperation()->getContext());
+      return setMixedSteps(b, getAsOpFoldResult(steps));
+    }
+
     int64_t getRank() { return getStaticLowerBound().size(); }
 
     /// Number of operands controlling the loop: lbs, ubs, steps
diff --git a/mlir/include/mlir/Dialect/Utils/LoopUtils.h b/mlir/include/mlir/Dialect/Utils/LoopUtils.h
new file mode 100644
index 0000000000000..15e901dc0e45e
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Utils/LoopUtils.h
@@ -0,0 +1,35 @@
+//===- LoopUtils.h - Helpers related to loop operations ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines utilities for loop operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_UTILS_LOOPUTILS_H_
+#define MLIR_DIALECT_UTILS_LOOPUTILS_H_
+
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+
+/// Bundles the lower bound, upper bound and step of a loop so they can be
+/// passed and returned together without confusing their order.
+struct LoopParams {
+  Value lowerBound;
+  Value upperBound;
+  Value step;
+};
+
+/// Calculate the normalized loop upper bounds with lower bound equal to zero
+/// and step equal to one.
+LoopParams emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
+                                    Value lb, Value ub, Value step);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_UTILS_LOOPUTILS_H_
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
index 42609e824c86a..fab5ffa26e574 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
 #define MLIR_INTERFACES_LOOPLIKEINTERFACE_H_
 
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/OpDefinition.h"
 
 namespace mlir {
@@ -28,6 +29,9 @@ using NewYieldValuesFn = std::function<SmallVector<Value>(
 namespace detail {
 /// Verify invariants of the LoopLikeOpInterface.
 LogicalResult verifyLoopLikeOpInterface(Operation *op);
+
+/// Verify invariants of the LoopLikeWithInductionVarsOpInterface.
+LogicalResult verifyLoopLikeWithInductionVarsOpInterface(Operation *op);
 } // namespace detail
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.td b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
index f0dc6e60eba58..d7580616efaa4 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
@@ -375,6 +375,132 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
   }];
 }
 
+def LoopLikeWithInductionVarsOpInterface
+    : OpInterface<"LoopLikeWithInductionVarsOpInterface", [LoopLikeOpInterface]> {
+  let description = [{
+    Interface for loop-like operations with one or more induction variables.
+    This interface contains helper functions for retrieving and updating the
+    lower bound, upper bound and step size for each induction variable and
+    provides a utility function to check whether the loop is normalized, i.e.
+    all lower bounds are equal to zero and steps are equal to one.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [
+    InterfaceMethod<[{
+        Return the induction variables. The verifier requires as many
+        induction variables as lower bounds, upper bounds and steps.
+      }],
+      /*retTy=*/"::mlir::ValueRange",
+      /*methodName=*/"getInductionVars"
+    >,
+    InterfaceMethod<[{
+        Return the lower bound values or attributes as OpFoldResult.
+      }],
+      /*retTy=*/"SmallVector<::mlir::OpFoldResult>",
+      /*methodName=*/"getMixedLowerBound"
+    >,
+    InterfaceMethod<[{
+        Return the step values or attributes as OpFoldResult.
+      }],
+      /*retTy=*/"SmallVector<::mlir::OpFoldResult>",
+      /*methodName=*/"getMixedStep"
+    >,
+    InterfaceMethod<[{
+        Return the upper bound values or attributes as OpFoldResult.
+      }],
+      /*retTy=*/"SmallVector<::mlir::OpFoldResult>",
+      /*methodName=*/"getMixedUpperBound"
+    >,
+    InterfaceMethod<[{
+        Return the lower bounds as values.
+      }],
+      /*retTy=*/"SmallVector<Value>",
+      /*methodName=*/"getLowerBound",
+      /*args=*/(ins "OpBuilder &":$b)
+    >,
+    InterfaceMethod<[{
+        Return the steps as values.
+      }],
+      /*retTy=*/"SmallVector<Value>",
+      /*methodName=*/"getStep",
+      /*args=*/(ins "OpBuilder &":$b)
+    >,
+    InterfaceMethod<[{
+        Return the upper bounds as values.
+      }],
+      /*retTy=*/"SmallVector<Value>",
+      /*methodName=*/"getUpperBound",
+      /*args=*/(ins "OpBuilder &":$b)
+    >,
+    InterfaceMethod<[{
+        Set the lower bounds from an array of `OpFoldResult`.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setMixedLowerBounds",
+      /*args=*/(ins "OpBuilder &":$b, "ArrayRef<OpFoldResult>":$lbs)
+    >,
+    InterfaceMethod<[{
+        Set the steps from an array of `OpFoldResult`.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setMixedSteps",
+      /*args=*/(ins "OpBuilder &":$b, "ArrayRef<OpFoldResult>":$steps)
+    >,
+    InterfaceMethod<[{
+        Set the upper bounds from an array of `OpFoldResult`.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setMixedUpperBounds",
+      /*args=*/(ins "OpBuilder &":$b, "ArrayRef<OpFoldResult>":$ubs)
+    >,
+    InterfaceMethod<[{
+        Set the lower bounds from an array of values.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setLowerBounds",
+      /*args=*/(ins "ArrayRef<Value>":$lbs)
+    >,
+    InterfaceMethod<[{
+        Set the steps from an array of values.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setSteps",
+      /*args=*/(ins "ArrayRef<Value>":$steps)
+    >,
+    InterfaceMethod<[{
+        Set the upper bounds from an array of values.
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"setUpperBounds",
+      /*args=*/(ins "ArrayRef<Value>":$ubs)
+    >,
+    InterfaceMethod<[{
+        Checks if the lower bounds are zeros and steps are ones.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"isNormalized",
+      /*args=*/(ins),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{
+        auto allEqual = [](ArrayRef<OpFoldResult> results, int64_t val) {
+          return llvm::all_of(results, [&](OpFoldResult ofr) {
+            auto intValue = getConstantIntValue(ofr);
+            return intValue.has_value() && intValue == val;
+          });
+        };
+        SmallVector<::mlir::OpFoldResult> lbs = $_op.getMixedLowerBound();
+        SmallVector<::mlir::OpFoldResult> steps = $_op.getMixedStep();
+        return allEqual(lbs, 0) && allEqual(steps, 1);
+      }]
+    >
+  ];
+
+  let verify = [{
+    return detail::verifyLoopLikeWithInductionVarsOpInterface($_op);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Traits
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 58bd61b2ae8b8..755ec7ecdfbad 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -82,6 +82,10 @@ std::unique_ptr<Pass> createLoopInvariantCodeMotionPass();
 /// Creates a pass that hoists loop-invariant subset ops.
 std::unique_ptr<Pass> createLoopInvariantSubsetHoistingPass();
 
+/// Create a pass that normalizes the loop bounds of loop-like operations with
+/// induction variables.
+std::unique_ptr<Pass> createNormalizeLoopBoundsPass();
+
 /// Creates a pass to strip debug information from a function.
 std::unique_ptr<Pass> createStripDebugInfoPass();
 
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 1b40a87c63f27..5d1256e502a12 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -377,6 +377,12 @@ def Mem2Reg : Pass<"mem2reg"> {
   ];
 }
 
+def NormalizeLoopBounds : Pass<"normalize-loop-bounds"> {
+  let summary = "Normalize the loop bounds of loop-like operations with "
+                "induction variables.";
+  let constructor = "mlir::createNormalizeLoopBoundsPass()";
+}
+
 def PrintOpStats : Pass<"print-op-stats"> {
   let summary = "Print statistics of operations";
   let constructor = "mlir::createPrintOpStatsPass()";
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 107fd0690f193..3e7becb094b6b 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1387,6 +1387,66 @@ void ForallOp::build(
   build(b, result, lbs, ubs, steps, outputs, mapping, bodyBuilderFn);
 }
 
+/// Set the lower bounds from `OpFoldResult`.
+void ForallOp::setMixedLowerBounds(OpBuilder &b, ArrayRef<OpFoldResult> lbs) {
+  SmallVector<int64_t> staticLbs;
+  SmallVector<Value> dynamicLbs;
+  dispatchIndexOpFoldResults(lbs, dynamicLbs, staticLbs);
+  getOperation()->setOperands(0, getDynamicLowerBound().size(), dynamicLbs);
+  (*this)->setAttr(getStaticLowerBoundAttrName(),
+                   b.getDenseI64ArrayAttr(staticLbs));
+  ArrayRef<int32_t> segmentSizes =
+      (*this)
+          ->getAttrOfType<DenseI32ArrayAttr>("operandSegmentSizes")
+          .asArrayRef();
+  SmallVector<int32_t> newSegmentSizes(segmentSizes.begin(),
+                                       segmentSizes.end());
+  newSegmentSizes[0] = dynamicLbs.size();
+  (*this)->setAttr("operandSegmentSizes",
+                   b.getDenseI32ArrayAttr(newSegmentSizes));
+}
+
+/// Set the upper bounds from `OpFoldResult`.
+void ForallOp::setMixedUpperBounds(OpBuilder &b, ArrayRef<OpFoldResult> ubs) {
+  SmallVector<int64_t> staticUbs;
+  SmallVector<Value> dynamicUbs;
+  dispatchIndexOpFoldResults(ubs, dynamicUbs, staticUbs);
+  size_t offset = getDynamicLowerBound().size();
+  getOperation()->setOperands(offset, getDynamicUpperBound().size(),
+                              dynamicUbs);
+  (*this)->setAttr(getStaticUpperBoundAttrName(),
+                   b.getDenseI64ArrayAttr(staticUbs));
+  ArrayRef<int32_t> segmentSizes =
+      (*this)
+          ->getAttrOfType<DenseI32ArrayAttr>("operandSegmentSizes")
+          .asArrayRef();
+  SmallVector<int32_t> newSegmentSizes(segmentSizes.begin(),
+                                       segmentSizes.end());
+  newSegmentSizes[1] = dynamicUbs.size();
+  (*this)->setAttr("operandSegmentSizes",
+                   b.getDenseI32ArrayAttr(newSegmentSizes));
+}
+
+/// Set the steps from `OpFoldResult`.
+void ForallOp::setMixedSteps(OpBuilder &b, ArrayRef<OpFoldResult> steps) {
+  SmallVector<int64_t> staticSteps;
+  SmallVector<Value> dynamicSteps;
+  dispatchIndexOpFoldResults(steps, dynamicSteps, staticSteps);
+  size_t offset = getDynamicLowerBound().size() + getDynamicUpperBound().size();
+  getOperation()->setOperands(offset, getDynamicStep().size(), dynamicSteps);
+  (*this)->setAttr(getStaticStepAttrName(),
+                   b.getDenseI64ArrayAttr(staticSteps));
+  ArrayRef<int32_t> segmentSizes =
+      (*this)
+          ->getAttrOfType<DenseI32ArrayAttr>("operandSegmentSizes")
+          .asArrayRef();
+  SmallVector<int32_t> newSegmentSizes(segmentSizes.begin(),
+                                       segmentSizes.end());
+  newSegmentSizes[2] = dynamicSteps.size();
+  (*this)->setAttr("operandSegmentSizes",
+                   b.getDenseI32ArrayAttr(newSegmentSizes));
+}
+
 // Checks if the lbs are zeros and steps are ones.
 bool ForallOp::isNormalized() {
   auto allEqual = [](ArrayRef<OpFoldResult> results, int64_t val) {
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index 6658cca03eba7..41f52cb84f4ed 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Utils/LoopUtils.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/PatternMatch.h"
@@ -29,16 +30,6 @@
 
 using namespace mlir;
 
-namespace {
-// This structure is to pass and return sets of loop parameters without
-// confusing the order.
-struct LoopParams {
-  Value lowerBound;
-  Value upperBound;
-  Value step;
-};
-} // namespace
-
 SmallVector<scf::ForOp> mlir::replaceLoopNestWithNewYields(
     RewriterBase &rewriter, MutableArrayRef<scf::ForOp> loopNest,
     ValueRange newIterOperands, const NewYieldValuesFn &newYieldValuesFn,
@@ -473,50 +464,6 @@ LogicalResult mlir::loopUnrollByFactor(
   return success();
 }
 
-/// Transform a loop with a strictly positive step
-///   for %i = %lb to %ub step %s
-/// into a 0-based loop with step 1
-///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
-///     %i = %ii * %s + %lb
-/// Insert the induction variable remapping in the body of `inner`, which is
-/// expected to be either `loop` or another loop perfectly nested under `loop`.
-/// Insert the definition of new bounds immediate before `outer`, which is
-/// expected to be either `loop` or its parent in the loop nest.
-static LoopParams emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
-                                           Value lb, Value ub, Value step) {
-  // For non-index types, generate `arith` instructions
-  // Check if the loop is already known to have a constant zero lower bound or
-  // a constant one step.
-  bool isZeroBased = false;
-  if (auto lbCst = getConstantIntValue(lb))
-    isZeroBased = lbCst.value() == 0;
-
-  bool isStepOne = false;
-  if (auto stepCst = getConstantIntValue(step))
-    isStepOne = stepCst.value() == 1;
-
-  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
-  // assuming the step is strictly positive.  Update the bounds and the step
-  // of the loop to go from 0 to the number of iterations, if necessary.
-  if (isZeroBased && isStepOne)
-    return {lb, ub, step};
-
-  Value diff = isZeroBased ? ub : rewriter.create<arith::SubIOp>(loc, ub, lb);
-  Value newUpperBound =
-      isStepOne ? diff : rewriter.create<arith::CeilDivSIOp>(loc, diff, step);
-
-  Value newLowerBound = isZeroBased
-                            ? lb
-                            : rewriter.create<arith::ConstantOp>(
-                                  loc, rewriter.getZeroAttr(lb.getType()));
-  Value newStep = isStepOne
-                      ? step
-                      : rewriter.create<arith::ConstantOp>(
-                            loc, rewriter.getIntegerAttr(step.getType(), 1));
-
-  return {newLowerBound, newUpperBound, newStep};
-}
-
 /// Get back the original induction variable values after loop normalization
 static void denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
                                          Value normalizedIv, Value origLb,
diff --git a/mlir/lib/Dialect/Utils/CMakeLists.txt b/mlir/lib/Dialect/Utils/CMakeLists.txt
index a0096e5f299d5..41b2fe287beb3 100644
--- a/mlir/lib/Dialect/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Utils/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRDialectUtils
   IndexingUtils.cpp
+  LoopUtils.cpp
   ReshapeOpsUtils.cpp
   StructuredOpsUtils.cpp
   StaticValueUtils.cpp
diff --git a/mlir/lib/Dialect/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Utils/LoopUtils.cpp
new file mode 100644
index 0000000000000..3d8aa5ef7dfc1
--- /dev/null
+++ b/mlir/lib/Dialect/Utils/LoopUtils.cpp
@@ -0,0 +1,52 @@
+//===- LoopUtils.cpp - Helpers related to loop operations -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Utils/LoopUtils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+
+using namespace mlir;
+
+/// Calculate the normalized loop upper bounds with lower bound equal to zero
+/// and step equal to one.
+LoopParams mlir::emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
+                                          Value lb, Value ub, Value step) {
+  // For non-index types, generate `arith` instructions
+  // Check if the loop is already known to have a constant zero lower bound or
+  // a constant one step.
+  bool isZeroBased = false;
+  if (auto lbCst = getConstantIntValue(lb))
+    isZeroBased = lbCst.value() == 0;
+
+  bool isStepOne = false;
+  if (auto stepCst = getConstantIntValue(step))
+    isStepOne = stepCst.value() == 1;
+
+  // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
+  // assuming the step is strictly positive.  Update the bounds and the step
+  // of the loop to go from 0 to the number of iterations, if necessary.
+  if (isZeroBased && isStepOne)
+    return {lb, ub, step};
+
+  Value diff =
+      isZeroBased ? ub : rewriter.createOrFold<arith::SubIOp>(loc, ub, lb);
+  Value newUpperBound =
+      isStepOne ? diff
+                : rewriter.createOrFold<arith::CeilDivSIOp>(loc, diff, step);
+
+  Value newLowerBound = isZeroBased
+                            ? lb
+                            : rewriter.create<arith::ConstantOp>(
+                                  loc, rewriter.getZeroAttr(lb.getType()));
+  Value newStep = isStepOne
+                      ? step
+                      : rewriter.create<arith::ConstantOp>(
+                            loc, rewriter.getIntegerAttr(step.getType(), 1));
+
+  return {newLowerBound, newUpperBound, newStep};
+}
diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp
index b51357198b1ca..5454411bc535b 100644
--- a/mlir/lib/IR/Operation.cpp
+++ b/mlir/lib/IR/Operation.cpp
@@ -245,8 +245,6 @@ void Operation::setOperands(ValueRange operands) {
 /// than the range pointed to by 'start'+'length'.
 void Operation::setOperands(unsigned start, unsigned length,
                             ValueRange operands) {
-  assert((start + length) <= getNumOperands() &&
-         "invalid operand range specified");
   if (LLVM_LIKELY(hasOperandStorage))
     return getOperandStorage().setOperands(this, start, length, operands);
   assert(operands.empty() && "setting operands without an operand storage");
diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp
index 1e0e87b64e811..3f478b9bc0b96 100644
--- a/mlir/lib/Interfaces/LoopLikeInterface.cpp
+++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp
@@ -113,3 +113,28 @@ LogicalResult detail::verifyLoopLikeOpInterface(Operation *op) {
 
   return success();
 }
+
+LogicalResult
+detail::verifyLoopLikeWithInductionVarsOpInterface(Operation *op) {
+  auto loopLikeOp = cast<LoopLikeWithInductionVarsOpInterface>(op);
+
+  // Verify number of induction variables, lower bounds, upper bounds and steps.
+  if (loopLikeOp.getInductionVars().size() !=
+      loopLikeOp.getMixedLowerBound().size())
+    return op->emitOpError(
+               "different number of induction variables and lower bounds: ")
+           << loopLikeOp.getInductionVars().size()
+           << " != " << loopLikeOp.getMixedLowerBound().size();
+  if (loopLikeOp.getInductionVars().size() != loopLikeOp.getMixedStep().size())
+    return op->emitOpError(
+               "different number of induction variables and steps: ")
+           << loopLikeOp.getInductionVars().size()
+           << " != " << loopLikeOp.getMixedStep().size();
+  if (loopLikeOp.getInductionVars().size() !=
+      loopLikeOp.getMixedUpperBound().size())
+    return op->emitOpError(
+               "different number of induction variables and upper bounds: ")
+           << loopLikeOp.getInductionVars().size()
+           << " != " << loopLikeOp.getMixedUpperBound().size();
+  return success();
+}
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 90c0298fb5e46..dc24da367ca48 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_library(MLIRTransforms
   LocationSnapshot.cpp
   LoopInvariantCodeMotion.cpp
   Mem2Reg.cpp
+  NormalizeLoopBounds.cpp
   OpStats.cpp
   PrintIR.cpp
   RemoveDeadValues.cpp
@@ -30,6 +31,7 @@ add_mlir_library(MLIRTransforms
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRCopyOpInterface
+  MLIRDialectUtils
   MLIRFunctionInterfaces
   MLIRLoopLikeInterface
   MLIRMemorySlotInterfaces
diff --git a/mlir/lib/Transforms/NormalizeLoopBounds.cpp b/mlir/lib/Transforms/NormalizeLoopBounds.cpp
new file mode 100644
index 0000000000000..ffb51b05ce1ca
--- /dev/null
+++ b/mlir/lib/Transforms/NormalizeLoopBounds.cpp
@@ -0,0 +1,118 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Utils/LoopUtils.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_NORMALIZELOOPBOUNDS
+#include "mlir/Transforms/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+/// Normalize a loop-like operation with induction variables, i.e. calculate
+/// new normalized upper bounds for lower bounds equal to zero and step sizes
+/// equal to one. Then, insert new `affine.apply` operations to calculate the
+/// denormalized index values and update all usage from the original induction
+/// variables to the results of the `affine.apply` operations.
+///
+/// Example:
+/// Transform a `scf.forall` loop with strictly positive steps
+///   forall (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) step (%s0, %s1)
+/// into a 0-based loop with step 1
+///   forall (%i, %j) in (ceildiv(%ub0 - %lb0, %s0), ceildiv(%ub1 - %lb1, %s1))
+LogicalResult
+normalizeLoopBounds(RewriterBase &rewriter,
+                    LoopLikeWithInductionVarsOpInterface loopLikeOp) {
+  OpBuilder::InsertionGuard g(rewriter);
+  if (loopLikeOp.isNormalized())
+    return success();
+
+  SmallVector<Value> newLbs;
+  SmallVector<Value> newUbs;
+  SmallVector<Value> newSteps;
+  rewriter.setInsertionPoint(loopLikeOp);
+  for (auto &&[iv, lb, ub, step] : llvm::zip(
+           loopLikeOp.getInductionVars(), loopLikeOp.getLowerBound(rewriter),
+           loopLikeOp.getUpperBound(rewriter), loopLikeOp.getStep(rewriter))) {
+    std::optional<int64_t> lbInt = getConstantIntValue(lb);
+    std::optional<int64_t> stepInt = getConstantIntValue(step);
+
+    rewriter.setInsertionPoint(loopLikeOp);
+    auto newLoopParams =
+        emitNormalizedLoopBounds(rewriter, loopLikeOp.getLoc(), lb, ub, step);
+
+    newLbs.push_back(newLoopParams.lowerBound);
+    newUbs.push_back(newLoopParams.upperBound);
+    newSteps.push_back(newLoopParams.step);
+
+    Region &region = loopLikeOp.getOperation()->getRegion(0);
+    rewriter.setInsertionPointToStart(&region.front());
+    SmallVector<Value> operands = {iv};
+    AffineExpr idxExpr, stepExpr, offsetExpr, res;
+    if (!lbInt && !stepInt) {
+      bindDims(loopLikeOp.getContext(), idxExpr, stepExpr, offsetExpr);
+      res = idxExpr * stepExpr + offsetExpr;
+      operands.push_back(step);
+      operands.push_back(lb);
+    } else if (!lbInt) {
+      bindDims(loopLikeOp.getContext(), idxExpr, offsetExpr);
+      res = idxExpr * stepInt.value() + offsetExpr;
+      operands.push_back(lb);
+    } else if (!stepInt) {
+      bindDims(loopLikeOp.getContext(), idxExpr, stepExpr);
+      res = idxExpr * stepExpr + lbInt.value();
+      operands.push_back(step);
+    } else {
+      bindDims(loopLikeOp.getContext(), idxExpr);
+      res = idxExpr * stepInt.value() + lbInt.value();
+    }
+
+    auto affineApply = rewriter.create<affine::AffineApplyOp>(
+        loopLikeOp.getLoc(), res, operands);
+    // `iv` is expected to be a block argument (no defining op), so the new
+    // `affine.apply` is the only user that has to keep using it.
+    rewriter.replaceAllUsesExcept(iv, affineApply.getResult(), affineApply);
+  }
+
+  rewriter.setInsertionPoint(loopLikeOp);
+  rewriter.modifyOpInPlace(loopLikeOp, [&]() {
+    loopLikeOp.setLowerBounds(newLbs);
+    loopLikeOp.setUpperBounds(newUbs);
+    loopLikeOp.setSteps(newSteps);
+  });
+  return success();
+}
+
+namespace {
+
+/// Pass which normalizes the loop bounds of operations implementing
+/// `LoopLikeWithInductionVarsOpInterface`.
+struct NormalizeLoopBounds
+    : public impl::NormalizeLoopBoundsBase<NormalizeLoopBounds> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *parentOp = getOperation();
+    IRRewriter rewriter(parentOp->getContext());
+
+    parentOp->walk([&](LoopLikeWithInductionVarsOpInterface loopLikeOp) {
+      (void)normalizeLoopBounds(rewriter, loopLikeOp);
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::createNormalizeLoopBoundsPass() {
+  return std::make_unique<NormalizeLoopBounds>();
+}
diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir
index ae0adf5a0a02d..0a96e01162d48 100644
--- a/mlir/test/Dialect/Affine/loop-coalescing.mlir
+++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir
@@ -72,19 +72,16 @@ func.func @multi_use() {
   return
 }
 
-func.func @unnormalized_loops() {
+// CHECK: %[[orig_ub_i:.*]]: index, %[[orig_ub_j:.*]]: index
+func.func @unnormalized_loops(%ubi: index, %ubj: index) {
   // CHECK: %[[orig_step_i:.*]] = arith.constant 2
   // CHECK: %[[orig_step_j:.*]] = arith.constant 3
   // CHECK: %[[orig_lb_i:.*]] = arith.constant 5
   // CHECK: %[[orig_lb_j:.*]] = arith.constant 7
-  // CHECK: %[[orig_ub_i:.*]] = arith.constant 10
-  // CHECK: %[[orig_ub_j:.*]] = arith.constant 17
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
   %c5 = arith.constant 5 : index
   %c7 = arith.constant 7 : index
-  %c10 = arith.constant 10 : index
-  %c17 = arith.constant 17 : index
 
   // Number of iterations in the outer scf.
   // CHECK: %[[diff_i:.*]] = arith.subi %[[orig_ub_i]], %[[orig_lb_i]]
@@ -101,10 +98,10 @@ func.func @unnormalized_loops() {
   // New bounds of the outer scf.
   // CHECK: %[[range:.*]] = arith.muli %[[numiter_i]], %[[numiter_j]]
   // CHECK: scf.for %[[i:.*]] = %[[lb_i]] to %[[range]] step %[[step_i]]
-  scf.for %i = %c5 to %c10 step %c2 {
+  scf.for %i = %c5 to %ubi step %c2 {
     // The inner loop has been removed.
     // CHECK-NOT: scf.for
-    scf.for %j = %c7 to %c17 step %c3 {
+    scf.for %j = %c7 to %ubj step %c3 {
       // The IVs are rewritten.
       // CHECK: %[[normalized_j:.*]] = arith.remsi %[[i]], %[[numiter_j]]
       // CHECK: %[[normalized_i:.*]] = arith.divsi %[[i]], %[[numiter_j]]
diff --git a/mlir/test/Transforms/normalize-loop-bounds.mlir b/mlir/test/Transforms/normalize-loop-bounds.mlir
new file mode 100644
index 0000000000000..5130f4282b36b
--- /dev/null
+++ b/mlir/test/Transforms/normalize-loop-bounds.mlir
@@ -0,0 +1,266 @@
+// RUN: mlir-opt %s  -split-input-file -normalize-loop-bounds -verify-diagnostics | FileCheck %s
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 + 2)>
+// CHECK-LABEL: func.func @for_lowerbound_static
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]])
+module {
+  func.func @for_lowerbound_static() {
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    scf.for %arg0 = %c2 to %c8 step %c1 {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-LABEL: func.func @for_lowerbound_dynamic
+// CHECK-SAME:  %[[ARG0:.+]]: index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[UB:.+]] = arith.subi %[[C8]], %[[ARG0]] : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[UB]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]], %[[ARG0]])
+module {
+  func.func @for_lowerbound_dynamic(%lb: index) {
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    scf.for %arg0 = %lb to %c8 step %c1 {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 2)>
+// CHECK-LABEL: func.func @for_step_static
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[C4]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]])
+module {
+  func.func @for_step_static() {
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    scf.for %arg0 = %c0 to %c8 step %c2 {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0 * d1)>
+// CHECK-LABEL: func.func @for_step_dynamic
+// CHECK-SAME:  %[[ARG0:.+]]: index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[UB:.+]] = arith.ceildivsi %[[C8]], %[[ARG0]] : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[UB]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]], %[[ARG0]])
+module {
+  func.func @for_step_dynamic(%step: index) {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    scf.for %arg0 = %c0 to %c8 step %step {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 4 + 1)>
+// CHECK-LABEL: func.func @for_lowerbound_and_step_static
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[C3]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]])
+module {
+  func.func @for_lowerbound_and_step_static() {
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c13 = arith.constant 13 : index
+    scf.for %arg0 = %c1 to %c13 step %c4 {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0 * d1 + d2)>
+// CHECK-LABEL: func.func @for_lowerbound_and_step_dynamic
+// CHECK-SAME:  %[[LB:.+]]: index, %[[STEP:.+]]: index
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C13:.+]] = arith.constant 13 : index
+// CHECK-DAG:   %[[SUB:.+]] = arith.subi %[[C13]], %[[LB]] : index
+// CHECK-DAG:   %[[UB:.+]] = arith.ceildivsi %[[SUB]], %[[STEP]] : index
+// CHECK:       scf.for %[[ARG:.+]] = %[[C0]] to %[[UB]] step %[[C1]]
+// CHECK-NEXT:    affine.apply #[[$MAP]](%[[ARG]], %[[STEP]], %[[LB]])
+module {
+  func.func @for_lowerbound_and_step_dynamic(%lb: index, %step: index) {
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+    %c13 = arith.constant 13 : index
+    scf.for %arg0 = %lb to %c13 step %step {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP0:.+]] = affine_map<(d0) -> (d0 + 4)>
+// CHECK-DAG:   #[[$MAP1:.+]] = affine_map<(d0) -> (d0 + 2)>
+// CHECK-LABEL: func.func @forall_lowerbound_static
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (6, 12)
+// CHECK-DAG:     affine.apply #[[$MAP1]](%[[ARG0]])
+// CHECK-DAG:     affine.apply #[[$MAP0]](%[[ARG1]])
+module {
+  func.func @forall_lowerbound_static() {
+    scf.forall (%arg2, %arg3) = (2, 4) to (8, 16) step (1, 1) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-LABEL: func.func @forall_lowerbound_dynamic
+// CHECK-SAME:  %[[LB0:.+]]: index, %[[LB1:.+]]: index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:   %[[UB0:.+]] = arith.subi %[[C8]], %[[LB0]] : index
+// CHECK-DAG:   %[[UB1:.+]] = arith.subi %[[C16]], %[[LB1]] : index
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (%[[UB0]], %[[UB1]])
+// CHECK-DAG:     affine.apply #[[$MAP0]](%[[ARG0]], %[[LB0]])
+// CHECK-DAG:     affine.apply #[[$MAP0]](%[[ARG1]], %[[LB1]])
+module {
+  func.func @forall_lowerbound_dynamic(%lb0: index, %lb1: index) {
+    scf.forall (%arg2, %arg3) = (%lb0, %lb1) to (8, 16) step (1, 1) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 8)>
+// CHECK-LABEL: func.func @forall_step_static
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (1, 2)
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG0]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG1]])
+module {
+  func.func @forall_step_static() {
+    scf.forall (%arg2, %arg3) = (0, 0) to (8, 16) step (8, 8) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK:       #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0 * d1)>
+// CHECK-LABEL: func.func @forall_step_dynamic
+// CHECK-SAME:  %[[STEP0:.+]]: index, %[[STEP1:.+]]: index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:   %[[UB0:.+]] = arith.ceildivsi %[[C8]], %[[STEP0]] : index
+// CHECK-DAG:   %[[UB1:.+]] = arith.ceildivsi %[[C16]], %[[STEP1]] : index
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (%[[UB0]], %[[UB1]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG0]], %[[STEP0]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG1]], %[[STEP1]])
+module {
+  func.func @forall_step_dynamic(%step0: index, %step1: index) {
+    scf.forall (%arg2, %arg3) = (0, 0) to (8, 16) step (%step0, %step1) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4 + 4)>
+// CHECK-DAG:   #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 2 + 2)>
+// CHECK-LABEL: func.func @forall_lowerbound_and_step_static
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (3, 3)
+// CHECK-DAG:     affine.apply #[[$MAP1]](%[[ARG0]])
+// CHECK-DAG:     affine.apply #[[$MAP0]](%[[ARG1]])
+module {
+  func.func @forall_lowerbound_and_step_static() {
+    scf.forall (%arg2, %arg3) = (2, 4) to (8, 16) step (2, 4) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0 * d1 + d2)>
+// CHECK-LABEL: func.func @forall_lowerbound_and_step_dynamic
+// CHECK-SAME:  %[[LB0:.+]]: index, %[[LB1:.+]]: index, %[[STEP0:.+]]: index, %[[STEP1:.+]]: index
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:   %[[SUB0:.+]] = arith.subi %[[C8]], %[[LB0]] : index
+// CHECK-DAG:   %[[SUB1:.+]] = arith.subi %[[C16]], %[[LB1]] : index
+// CHECK-DAG:   %[[UB0:.+]] = arith.ceildivsi %[[SUB0]], %[[STEP0]] : index
+// CHECK-DAG:   %[[UB1:.+]] = arith.ceildivsi %[[SUB1]], %[[STEP1]] : index
+// CHECK:       scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (%[[UB0]], %[[UB1]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG0]], %[[STEP0]], %[[LB0]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG1]], %[[STEP1]], %[[LB1]])
+module {
+  func.func @forall_lowerbound_and_step_dynamic(%lb0: index, %lb1: index, %step0: index, %step1: index) {
+    scf.forall (%arg2, %arg3) = (%lb0, %lb1) to (8, 16) step (%step0, %step1) {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 4 + 2)>
+// CHECK-LABEL: func.func @forall_with_shared_outs_static
+// CHECK-SAME:  %[[OUT:.+]]: tensor<200x100xf32>
+// CHECK:       scf.forall (%[[ARG0:.+]]) in (2) shared_outs(%{{.+}} = %[[OUT]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG0]])
+module {
+  func.func @forall_with_shared_outs_static(%out: tensor<200x100xf32>) {
+    scf.forall (%arg0) = (2) to (8) step (4) shared_outs (%o = %out) -> tensor<200x100xf32> {
+    }
+    return
+  }
+}
+
+// -----
+
+// CHECK-DAG:   #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0 * d1 + d2)>
+// CHECK-LABEL: func.func @forall_with_shared_outs_dynamic
+// CHECK-SAME:  %[[LB:.+]]: index, %[[STEP:.+]]: index, %[[OUT:.+]]: tensor<200x100xf32>
+// CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG:   %[[SUB:.+]] = arith.subi %[[C8]], %[[LB]] : index
+// CHECK-DAG:   %[[UB:.+]] = arith.ceildivsi %[[SUB]], %[[STEP]] : index
+// CHECK:       scf.forall (%[[ARG:.+]]) in (%[[UB]]) shared_outs(%{{.+}} = %[[OUT]])
+// CHECK-DAG:     affine.apply #[[$MAP]](%[[ARG]], %[[STEP]], %[[LB]])
+module {
+  func.func @forall_with_shared_outs_dynamic(%lb: index, %step: index, %out: tensor<200x100xf32>) {
+    scf.forall (%arg0) = (%lb) to (8) step (%step) shared_outs (%o = %out) -> tensor<200x100xf32> {
+    }
+    return
+  }
+}