[Mlir-commits] [mlir] Revert "[mlir][SCF] Allow using a custom operation to generate loops with `mlir::tileUsingSCF`." (PR #159598)

Thu Sep 18 09:30:32 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: None (MaheshRavishankar)

<details>
<summary>Changes</summary>

Reverts llvm/llvm-project#159506

It was committed by accident. Reverting it so that it can go through review.

---

Patch is 61.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159598.diff


5 Files Affected:

- (modified) mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h (+15-111) 
- (modified) mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp (+195-317) 
- (removed) mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir (-60) 
- (modified) mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp (-148) 
- (modified) mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td (-23) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 6b05ade37881c..3205da6e448fc 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -33,14 +33,6 @@ using SCFTileSizeComputationFunction =
 
 /// Options to use to control tiling.
 struct SCFTilingOptions {
-  /// Specify which loop construct to use for tile and fuse.
-  enum class LoopType { ForOp, ForallOp, CustomOp };
-  LoopType loopType = LoopType::ForOp;
-  SCFTilingOptions &setLoopType(LoopType type) {
-    loopType = type;
-    return *this;
-  }
-
   /// Computation function that returns the tile sizes to use for each loop.
   /// Returning a tile size of zero implies no tiling for that loop. If the
   /// size of the returned vector is smaller than the number of loops, the inner
@@ -58,17 +50,6 @@ struct SCFTilingOptions {
   /// proper interaction with folding.
   SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> tileSizes);
 
-  /// The interchange vector to reorder the tiled loops.
-  SmallVector<int64_t> interchangeVector = {};
-  SCFTilingOptions &setInterchange(ArrayRef<int64_t> interchange) {
-    interchangeVector = llvm::to_vector(interchange);
-    return *this;
-  }
-
-  //-------------------------------------------------------------------------//
-  // Options related to tiling using `scf.forall`.
-  //-------------------------------------------------------------------------//
-
   /// Computation function that returns the number of threads to use for
   /// each loop. Returning a num threads of zero implies no tiling for that
   /// loop. If the size of the returned vector is smaller than the number of
@@ -89,6 +70,21 @@ struct SCFTilingOptions {
   /// function that computes num threads at the point they are needed.
   SCFTilingOptions &setNumThreads(ArrayRef<OpFoldResult> numThreads);
 
+  /// The interchange vector to reorder the tiled loops.
+  SmallVector<int64_t> interchangeVector = {};
+  SCFTilingOptions &setInterchange(ArrayRef<int64_t> interchange) {
+    interchangeVector = llvm::to_vector(interchange);
+    return *this;
+  }
+
+  /// Specify which loop construct to use for tile and fuse.
+  enum class LoopType { ForOp, ForallOp };
+  LoopType loopType = LoopType::ForOp;
+  SCFTilingOptions &setLoopType(LoopType type) {
+    loopType = type;
+    return *this;
+  }
+
   /// Specify mapping of loops to devices. This is only respected when the loop
   /// constructs support such a mapping (like `scf.forall`). Will be ignored
   /// when using loop constructs that dont support such a mapping (like
@@ -121,98 +117,6 @@ struct SCFTilingOptions {
     reductionDims.insert(dims.begin(), dims.end());
     return *this;
   }
-
-  //-------------------------------------------------------------------------//
-  // Options related to tiling using custom loop.
-  //-------------------------------------------------------------------------//
-
-  // For generating the inter-tile loops using a custom loop, two callback
-  // functions are needed
-  // 1. That generates the "loop header", i.e. the loop that iterates over the
-  //    different tiles.
-  // 2. That generates the loop terminator
-  //
-  // For `scf.forall` case the call back to generate loop header would generate
-  //
-  // ```mlir
-  // scf.forall (...) = ... {
-  //   ..
-  // }
-  // ```
-  //
-  // and the call back to generate the loop terminator would generate the
-  // `scf.in_parallel` region
-  //
-  // ```mlir
-  // scf.forall (...) = ... {
-  //   scf.in_parallel {
-  //      tensor.parallel_insert_slice ...
-  //   }
-  // }
-  // ```
-  //
-
-  // Information that is to be returned by the callback to generate the loop
-  // header needed for the rest of the tiled codegeneration.
-  // - `loops`: The generated loops
-  // - `tileOffset`: The values that represent the offset of the iteration space
-  // tile
-  // - `tileSizes` : The values that represent the size of the iteration space
-  // tile.
-  // - `destinationTensors` : The tensors to use as destinations during tiling.
-  struct CustomLoopHeaderInfo {
-    SmallVector<LoopLikeOpInterface> loops;
-    SmallVector<OpFoldResult> tileOffset;
-    SmallVector<OpFoldResult> tileSizes;
-    SmallVector<Value> destinationTensors;
-  };
-
-  // Type of the callback function that generates the loop headers.
-  // - `loopRanges` : Values that represent the full size of the iteration space
-  //                  being tiled.
-  // - `giveTileSizes` : The tile sizes that are to be used to tile the
-  // iteration
-  //                     space.
-  // - `destinationTensors` : The tensors to use as destinations for the results
-  //                          of the tiled loop for loops that implement
-  //                          `DestinationStyleOpInterface`.
-  // Returns the `CustomLoopHeaderInfo` object (described above). it is expected
-  // that this function sets the insertion point of `rewriter` to the program
-  // point where the intra-tile loop computation is to be generated.
-  using GenerateLoopHeaderFn = std::function<FailureOr<CustomLoopHeaderInfo>(
-      RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
-      ArrayRef<OpFoldResult> givenTileSizes, ValueRange destinationTensors)>;
-
-  // Type of the callback function that generates the loop terminator.
-  // - `tiledResults` : Tiles of the result computed for the iteration space
-  // tile
-  // - `resultOffsets` : For each of the `tiledResults`, the offset at which
-  //                     the result tile is to be "inserted" back into the
-  //                     destination tensor.
-  // - `resultSizes` : For each of the `tiledResults`, the size of the result
-  // tile
-  //                   that is to be "inserted" back into the destination
-  //                   tensor.
-  // Returns the `CustomLoopHeaderInfo` object (described above)
-  using GenerateLoopTerminatorFn = std::function<LogicalResult(
-      RewriterBase &rewriter, Location loc, ValueRange tiledResults,
-      ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
-      ArrayRef<SmallVector<OpFoldResult>> resultSizes,
-      ValueRange destinationTensors)>;
-
-  // Callback function to generate the inter-tile loop header.
-  GenerateLoopHeaderFn generateLoopHeaderFn = nullptr;
-  // Callback function to generate the inter-tile loop terminator.
-  GenerateLoopTerminatorFn generateLoopTerminatorFn = nullptr;
-  // Helper function to set the callbacks for inter-tile loop header and
-  // terminator functions when using a custom operation for the loop.
-  SCFTilingOptions &
-  setCustomLoopGenerationFns(GenerateLoopHeaderFn headerFn,
-                             GenerateLoopTerminatorFn terminatorFn) {
-    generateLoopHeaderFn = std::move(headerFn);
-    generateLoopTerminatorFn = std::move(terminatorFn);
-    return *this;
-  }
 };
 
 /// Transformation information returned after tiling.
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index c3899473289e2..834c02126fa53 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -155,18 +155,18 @@ getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op,
 static LogicalResult checkTileSizes(TilingInterface op,
                                     scf::SCFTilingOptions::LoopType loopType,
                                     ReductionTilingStrategy reductionStrategy,
-                                    ArrayRef<OpFoldResult> givenTileSizes,
+                                    ArrayRef<OpFoldResult> tileSizes,
                                     ArrayRef<OpFoldResult> numThreads) {
   auto iterators = op.getLoopIteratorTypes();
-  assert(iterators.size() == givenTileSizes.size() &&
+  assert(iterators.size() == tileSizes.size() &&
          "expected as many tile size values as number of loops");
   assert((numThreads.empty() || (numThreads.size() == iterators.size())) &&
          "when specified, expected number of threads to use for each loop");
 
   bool isParallelTiling = false;
-  for (auto [index, iterator, givenTileSize] :
-       llvm::enumerate(iterators, givenTileSizes)) {
-    if (!isConstantIntValue(givenTileSize, 0)) {
+  for (auto [index, iterator, tileSize] :
+       llvm::enumerate(iterators, tileSizes)) {
+    if (!isConstantIntValue(tileSize, 0)) {
       isParallelTiling |= iterator == utils::IteratorType::parallel;
     }
 
@@ -186,7 +186,7 @@ static LogicalResult checkTileSizes(TilingInterface op,
       }
 
       if (std::optional<int64_t> constTileSize =
-              getConstantIntValue(givenTileSize)) {
+              getConstantIntValue(tileSize)) {
         if (constTileSize.value() > 0 &&
             iterator != utils::IteratorType::parallel) {
           op.emitWarning() << "tiling is not thread safe at axis #" << index;
@@ -207,11 +207,11 @@ static LogicalResult checkTileSizes(TilingInterface op,
 /// Get the reduction dims that are tiled. This accounts for reduction dims
 /// that are specified as tiled, but the tile size is 0.
 static SetVector<unsigned>
-getSanitizedReductionDims(ArrayRef<OpFoldResult> givenTileSizes,
+getSanitizedReductionDims(ArrayRef<OpFoldResult> tileSizes,
                           const scf::SCFTilingOptions &options) {
   SetVector<unsigned> reductionDims;
   for (auto dim : options.reductionDims) {
-    if (isConstantIntValue(givenTileSizes[dim], 0))
+    if (isConstantIntValue(tileSizes[dim], 0))
       continue;
     reductionDims.insert(dim);
   }
@@ -236,14 +236,14 @@ static bool tileDividesIterationDomain(Range loopRange) {
 /// `tileSize`, i.e., `min(tileSize, range.end() - offset)`.
 static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
                                        Range loopRange, OpFoldResult offset,
-                                       OpFoldResult givenTileSize) {
-  std::optional<int64_t> ts = getConstantIntValue(givenTileSize);
+                                       OpFoldResult tileSize) {
+  std::optional<int64_t> ts = getConstantIntValue(tileSize);
   if (ts && ts.value() == 1)
-    return givenTileSize;
+    return tileSize;
 
   if (tileDividesIterationDomain(
-          Range{loopRange.offset, loopRange.size, givenTileSize}))
-    return givenTileSize;
+          Range{loopRange.offset, loopRange.size, tileSize}))
+    return tileSize;
 
   // The tile size to use (to avoid out of bounds access) is  minimum of
   // `tileSize` and `ub - iv`, where `iv` is the induction variable of the tiled
@@ -254,15 +254,15 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
   AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext());
   Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size);
   return affine::makeComposedFoldedAffineMin(
-      b, loc, minMap, SmallVector<OpFoldResult>{offset, size, givenTileSize});
+      b, loc, minMap, SmallVector<OpFoldResult>{offset, size, tileSize});
 }
 
 /// Returns true if the maximum tile offset `tileSize * numThreads-1` is less
 /// than `iterationSize`.
-static bool canOmitTileOffsetInBoundsCheck(OpFoldResult givenTileSize,
+static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
                                            OpFoldResult numThreads,
                                            OpFoldResult iterationSize) {
-  std::optional<int64_t> tileSizeConst = getConstantIntValue(givenTileSize);
+  std::optional<int64_t> tileSizeConst = getConstantIntValue(tileSize);
   std::optional<int64_t> numThreadsConst = getConstantIntValue(numThreads);
   std::optional<int64_t> iterSizeConst = getConstantIntValue(iterationSize);
   if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
@@ -274,51 +274,114 @@ static bool canOmitTileOffsetInBoundsCheck(OpFoldResult givenTileSize,
 /// `offset`s and `size`s of the tile of the iteration space that the
 /// innermost loop body of the generated tiled loops corresponds to.
 static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
-getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
+getTileOffsetAndSizes(RewriterBase &rewriter, Location loc,
+                      ReductionTilingStrategy strategy, ValueRange ivs,
                       ArrayRef<Range> iterationDomain,
-                      ArrayRef<OpFoldResult> givenTileSizes) {
+                      ArrayRef<OpFoldResult> tileSizes,
+                      ArrayRef<OpFoldResult> numThreads,
+                      const llvm::SetVector<unsigned> &reductionDims) {
   SmallVector<OpFoldResult> offsets, sizes;
   int materializedLoopNum = 0;
-  for (auto [givenTileSize, loopRange] :
-       llvm::zip_equal(givenTileSizes, iterationDomain)) {
-
-    // Non-tiled cases, set the offset and size to the
-    // `loopRange.offset/size`.
-    if (isZeroInteger(givenTileSize)) {
-      offsets.push_back(loopRange.offset);
-      sizes.push_back(loopRange.size);
-      continue;
+
+  if (!numThreads.empty()) {
+    AffineExpr d0, d1, s0, s1;
+    AffineExpr offsetExpr, residualTileSizeExpr;
+    bindDims(rewriter.getContext(), d0, d1);
+    bindSymbols(rewriter.getContext(), s0, s1);
+    offsetExpr = d0 + d1 * s0;
+    residualTileSizeExpr = s1 - (d0 + d1 * s0);
+
+    for (auto [index, nt, tileSize, loopRange] :
+         llvm::enumerate(numThreads, tileSizes, iterationDomain)) {
+
+      // Non-tiled cases, set the offset and size to the
+      // `loopRange.offset/size`.
+      if (isZeroInteger(nt)) {
+        offsets.push_back(loopRange.offset);
+        sizes.push_back(loopRange.size);
+        continue;
+      }
+
+      Value iv = ivs[materializedLoopNum++];
+      OpFoldResult offset = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, offsetExpr,
+          ArrayRef<OpFoldResult>{loopRange.offset, iv, tileSize});
+      OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
+          rewriter, loc, residualTileSizeExpr,
+          {loopRange.offset, nt, tileSize, loopRange.size});
+
+      OpFoldResult size = tileSize;
+      if (!isZeroInteger(residualTileSize)) {
+        OpFoldResult sizeMinusOffsetPerThread =
+            affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
+                                                  {offset, loopRange.size});
+        size = affine::makeComposedFoldedAffineMin(
+            rewriter, loc,
+            AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()),
+            {sizeMinusOffsetPerThread, tileSize});
+      }
+
+      // Consider the case where the original loop was `[0, 100)`.
+      // If number of threads are `7`, the tile size would be computed as
+      // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6)
+      // - `offset = 0 + 6 * 15 = 105`
+      // - `tileSize = min(15, 100 - 105) = -5`
+      // To avoid negative tile sizes, we need to do a further
+      // `nonNegativeTileSize = affine.max(0, tileSize)`.
+      // This `max` can be avoided if
+      //  `offset + tileSize * (numThreads - 1) < (ub - lb)`
+      if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) {
+        AffineMap maxMap =
+            AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
+        size = affine::makeComposedFoldedAffineMax(
+            rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size});
+      }
+
+      offsets.push_back(offset);
+      sizes.push_back(size);
     }
+    return {offsets, sizes};
+  } else {
+    for (auto [tileSize, loopRange] :
+         llvm::zip_equal(tileSizes, iterationDomain)) {
+
+      // Non-tiled cases, set the offset and size to the
+      // `loopRange.offset/size`.
+      if (isZeroInteger(tileSize)) {
+        offsets.push_back(loopRange.offset);
+        sizes.push_back(loopRange.size);
+        continue;
+      }
 
-    Value iv = ivs[materializedLoopNum++];
-    OpFoldResult offset = getAsOpFoldResult(iv);
-    offsets.push_back(offset);
-    OpFoldResult size =
-        getBoundedTileSize(rewriter, loc, loopRange, offset, givenTileSize);
-    sizes.push_back(size);
+      Value iv = ivs[materializedLoopNum++];
+      OpFoldResult offset = getAsOpFoldResult(iv);
+      offsets.push_back(offset);
+      OpFoldResult size =
+          getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize);
+      sizes.push_back(size);
+    }
+    return {offsets, sizes};
   }
-  return {offsets, sizes};
 }
 
 /// Function to return the bounds of the loops to be generated.
 static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>,
                   SmallVector<OpFoldResult>>
 getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
-              ArrayRef<OpFoldResult> givenTileSizes) {
+              ArrayRef<OpFoldResult> tileSizes) {
   SmallVector<OpFoldResult> lbs, ubs, steps;
-  for (auto [loopRange, givenTileSize] :
-       llvm::zip_equal(loopRanges, givenTileSizes)) {
+  for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
     // No loop if the tile size is 0.
-    if (isZeroInteger(givenTileSize))
+    if (isZeroInteger(tileSize))
       continue;
     lbs.push_back(loopRange.offset);
     ubs.push_back(loopRange.size);
-    steps.push_back(givenTileSize);
+    steps.push_back(tileSize);
   }
   return {lbs, ubs, steps};
 }
 
-/// Typedef for function that allows returning additional yielded values during
+/// A function that allows returning additional yielded values during
 /// `yieldTiledValuesAndReplace`.
 /// - `ivs` induction variable for the loop.
 /// - `newBbArgs` basic block arguments corresponding to newly added iter_args.
@@ -339,30 +402,6 @@ using YieldTiledValuesFn = std::function<LogicalResult(
     SmallVector<SmallVector<OpFoldResult>> &resultOffsets,
     SmallVector<SmallVector<OpFoldResult>> &resultSizes)>;
 
-/// Typedef for function that implements the body of a tiled loop.
-/// - `ivs` induction variable for the loop.
-/// - `tileOffsets` represents offsets for the tiled iteration space.
-/// - `tileSizes` represents the sizes for the tiled iteraiton space.
-/// - `outerDestinationTensors` tensor that holds the result. Is same size
-///   as the destination operands of the original operations.
-/// - `tiledResults` results of the tiled computation, corresponds to
-///   tiles of the original operation computed by the loop body.
-///   Should be same size as the `destinationTensors`
-/// - `resultOffsets` is of the same size as `tiledResults` and represents
-///   the offset to use when writing the corresponding element from
-///   `tiledResults` into `destinationTensors`.
-/// - `resultOffsets` is of the same size as `tiledResults` and represents
-///   the size to use when writing the corresponding element from
-///   `tiledResults` into `destinationTensors`.
-/// In case the method needs to return `failure()` the method is expected
-/// to clean up any inserted operations.
-using GenerateTiledBodyFn = std::function<LogicalResult(
-    RewriterBase &rewriter, Location Loc, ValueRange ivs,
-    ArrayRef<OpFoldResult> tileOffsets, ArrayRef<OpFoldResult> tileSizes,
-    ValueRange outerDestinationTensors, SmallVector<Value> &tiledResults,
-    SmallVector<SmallVector<OpFoldResult>> &resultOffsets,
-    SmallVector<SmallVector<OpFoldResult>> &resultSizes)>;
-
 /// Clones the operation and updates the destination if the operation
 /// implements the `DestinationStyleOpInterface`.
 static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
@@ -378,25 +417,26 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter,
 
 /// Generate the tile-loop nest using `scf.for` operation.
 /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
-/// - `givenTileSizes` is the tile sizes to use. Zero represent untiled loops.
-/// - `outerDestinationTensors` are the init values to use for the outer most
-/// loop.
-/// - `tiledBodyFn` is called to generated the loop body of the inner
+/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops.
+/// - `destinationTensors` are the init values to use for the outer most loop.
+/// - `yieldTiledValuesFn` is called to generated the loop body of the inner
 /// most
 ///    loop.
-...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/159598