[Mlir-commits] [mlir] [MLIR] Add continuous tiling to TileUsingForOp (PR #82792)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Fri Feb 23 08:58:22 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-mlir

Author: None (muneebkhan85)

<details>
<summary>Changes</summary>

This patch adds continuous tiling options to TileUsingForOp. In cases where the tensor dimensions are not exactly divisible by the tile size, regular tiling leaves leftover tensor chunks that are irregularly tiled. This approach attempts to tile the leftover chunk with a smaller tile size and repeats this process recursively using exponentially diminishing tile sizes. The transform eventually generates a chain of loops that apply tiling using diminishing tile sizes. It lowers from the linalg dialect to the scf dialect.

---

Patch is 62.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/82792.diff


6 Files Affected:

- (modified) mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td (+8-1) 
- (modified) mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h (+14) 
- (modified) mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp (+32-8) 
- (modified) mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp (+457) 
- (modified) mlir/python/mlir/dialects/transform/structured.py (+4) 
- (added) mlir/test/Dialect/Linalg/continuous-tiling.mlir (+390) 


``````````diff
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 309573a562872f..4976c8e82db4df 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1833,7 +1833,9 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
     be as many handles as `ShapedType::kDynamic` values in the
     `static_sizes` attribute. A static size of `0` indicates that the dimension
     should not be tiled. No loop will be generated for such dimensions. If all
-    tile sizes are `0`, this transform is effectively a no-op.
+    tile sizes are `0`, this transform is effectively a no-op. To apply
+    continuous tiling, `continuous_tiles` needs to be supplied with as many
+    boolean values as there are nested loops.
 
     This op returns handles to the tiled op (in the generated loop nest) and the
     generated loops. The number of loops is the number of tile sizes that are
@@ -1859,6 +1861,7 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
   let arguments = (ins TransformHandleTypeInterface:$target,
                    Variadic<TransformAnyParamTypeOrAnyHandle>:$dynamic_sizes,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sizes,
+                   DefaultValuedOptionalAttr<DenseBoolArrayAttr, "{}">:$continuous_tiles,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$interchange,
                    DefaultValuedOptionalAttr<DenseBoolArrayAttr, "{}">:$scalable_sizes);
   let results = (outs TransformHandleTypeInterface:$tiled_linalg_op,
@@ -1867,22 +1870,26 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
     OpBuilder<(ins "TypeRange":$loopTypes,
                    "Value":$target,
                    "ArrayRef<int64_t>":$staticTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "TypeRange":$loopTypes,
                    "Value":$target,
                    "ArrayRef<OpFoldResult>":$mixedTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<int64_t>":$staticTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<OpFoldResult>":$mixedTileSizes,
+                   CArg<"ArrayRef<bool>", "{}">:$continuousTiles,
                    CArg<"ArrayRef<int64_t>", "{}">:$interchange,
                    CArg<"std::optional<ArrayRef<bool>>", "std::nullopt">:
                       $scalableSizes)>,
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 965ef9e203be28..b40291b5a80da5 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -71,6 +71,14 @@ struct SCFTilingOptions {
         mapping, [](auto attr) -> Attribute { return attr; });
     return *this;
   }
+
+  /// Specify which loops in the loop nest are to be continuously tiled.
+  SmallVector<bool> continuousTileMappingVector = {};
+  SCFTilingOptions &setCTileMapping(ArrayRef<bool> ctile) {
+    continuousTileMappingVector =
+        llvm::map_to_vector(ctile, [](auto attr) -> bool { return attr; });
+    return *this;
+  }
 };
 
 /// Transformation information returned after tiling.
@@ -92,6 +100,12 @@ FailureOr<SCFTilingResult> tileUsingSCF(RewriterBase &rewriter,
                                         TilingInterface op,
                                         const SCFTilingOptions &options);
 
+/// Method to continuously tile an op that implements the `TilingInterface`
+/// using `scf.for` for iterating over the tiles.
+FailureOr<SCFTilingResult>
+continuousTileUsingSCF(RewriterBase &rewriter, TilingInterface op,
+                            const SCFTilingOptions &options);
+
 /// Options used to control tile + fuse.
 struct SCFTileAndFuseOptions {
   /// The tiling options used to control the tiling of the consumer.
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 299965bcfc3ab3..75080ea7721beb 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2476,39 +2476,41 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne(
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, TypeRange loopTypes,
     Value target, ArrayRef<int64_t> staticTileSizes,
-    ArrayRef<int64_t> interchange,
+    ArrayRef<bool> continuousTiles, ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   return build(builder, result, loopTypes,
                /*target=*/target,
                /*mixedTileSizes=*/
                getAsOpFoldResult(builder.getI64ArrayAttr(staticTileSizes)),
-               interchange, scalableSizes);
+               continuousTiles, interchange, scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<int64_t> staticTileSizes, ArrayRef<int64_t> interchange,
+    ArrayRef<int64_t> staticTileSizes, ArrayRef<bool> continuousTiles,
+    ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   build(builder, result, target,
         getAsOpFoldResult(builder.getI64ArrayAttr(staticTileSizes)),
-        interchange, scalableSizes);
+        builder.getDenseBoolArrayAttr(continuousTiles), interchange, scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<OpFoldResult> mixedTileSizes, ArrayRef<int64_t> interchange,
+    ArrayRef<OpFoldResult> mixedTileSizes, ArrayRef<bool> continuousTiles,
+    ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   // Loop types are automaticaly splat by the callee, setting up one is
   // enough.
   SmallVector<Type> loopTypes(1, builder.getType<transform::AnyOpType>());
-  build(builder, result, loopTypes, target, mixedTileSizes, interchange,
+  build(builder, result, loopTypes, target, mixedTileSizes, continuousTiles, interchange,
         scalableSizes);
 }
 
 void transform::TileUsingForOp::build(
     OpBuilder &builder, OperationState &result, TypeRange loopTypes,
     Value target, ArrayRef<OpFoldResult> mixedTileSizes,
-    ArrayRef<int64_t> interchange,
+    ArrayRef<bool> continuousTiles, ArrayRef<int64_t> interchange,
     std::optional<ArrayRef<bool>> scalableSizes) {
   SmallVector<int64_t> staticTileSizes;
   SmallVector<Value> dynamicTileSizes;
@@ -2517,6 +2519,7 @@ void transform::TileUsingForOp::build(
   // attributes for multiple variadic operands. In the absence of this,
   // horrible bugs ensue.
   auto staticTileSizesAttr = builder.getDenseI64ArrayAttr(staticTileSizes);
+  auto continuousTilesAttr = builder.getDenseBoolArrayAttr(continuousTiles);
   unsigned numExpectedLoops =
       staticTileSizes.size() - llvm::count(staticTileSizes, 0);
   SmallVector<Type> resultTypes;
@@ -2535,6 +2538,7 @@ void transform::TileUsingForOp::build(
         /*target=*/target,
         /*dynamic_sizes=*/dynamicTileSizes,
         /*static_sizes=*/staticTileSizesAttr,
+        /*continuous_tiles=*/continuousTilesAttr,
         /*interchange=*/builder.getDenseI64ArrayAttr(interchange),
         /*scalable_sizes=*/expandedScalableSizes);
 }
@@ -2675,8 +2679,15 @@ transform::TileUsingForOp::apply(transform::TransformRewriter &rewriter,
     }
 
     tilingOptions.setInterchange(getInterchange());
-    FailureOr<scf::SCFTilingResult> maybeTilingResult =
+    tilingOptions.setCTileMapping(getContinuousTiles());
+
+    FailureOr<scf::SCFTilingResult> maybeTilingResult;
+    if (tilingOptions.continuousTileMappingVector.empty())
+      maybeTilingResult =
         tileUsingSCF(rewriter, tilingInterface, tilingOptions);
+    else
+      maybeTilingResult =
+          continuousTileUsingSCF(rewriter, tilingInterface, tilingOptions);
     if (failed(maybeTilingResult))
       return DiagnosedSilenceableFailure::definiteFailure();
 
@@ -2725,6 +2736,18 @@ ParseResult parseOptionalInterchange(OpAsmParser &parser,
   return success();
 }
 
+ParseResult parseOptionalContinuousTiles(OpAsmParser &parser,
+                                         OperationState &result) {
+  if (failed(parser.parseOptionalKeyword("continuous_tiles")))
+    return success();
+  if (failed(parser.parseEqual()))
+    return failure();
+  result.addAttribute(
+      transform::TileUsingForOp::getContinuousTilesAttrName(result.name),
+      DenseBoolArrayAttr::parse(parser, Type{}));
+  return success();
+}
+
 void printOptionalInterchange(OpAsmPrinter &p,
                               ArrayRef<int64_t> interchangeVals) {
   if (!interchangeVals.empty()) {
@@ -2747,6 +2770,7 @@ ParseResult transform::TileUsingForOp::parse(OpAsmParser &parser,
   if (parser.parseOperand(target) || parser.getCurrentLocation(&operandLoc) ||
       parseDynamicIndexList(parser, dynamicSizes, staticSizes, scalableVals) ||
       parseOptionalInterchange(parser, result) ||
+      parseOptionalContinuousTiles(parser, result) ||
       parser.parseOptionalAttrDict(result.attributes) ||
       parser.parseColonType(functionalType))
     return ParseResult::failure();
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 1a84a59ddb69df..f81a901c82db99 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -31,6 +31,8 @@
 
 using namespace mlir;
 
+static constexpr char kLoopIndexLabel[] = "__loop_index__";
+
 scf::SCFTilingOptions &
 scf::SCFTilingOptions::setTileSizes(ArrayRef<OpFoldResult> ts) {
   assert(!tileSizeComputationFunction && "tile sizes already set");
@@ -309,6 +311,189 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
   return rewriter.notifyMatchFailure(loc, "unhandled loop type");
 }
 
+static void continuousLoopNestHelper(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    SmallVector<LoopLikeOpInterface> &loops, uint64_t loopLevelIdx, uint64_t &loopIdx,
+    ArrayRef<OpFoldResult> tileSizes, SmallVector<bool> &CTileVector,
+    std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops, ValueRange destinationTensors = {},
+    bool isHeadOrInsideHeadLoop = false) {
+
+  Value offset = getValueOrCreateConstantIndexOp(
+      builder, loc, loopRanges[loopLevelIdx].offset);
+  Value size = getValueOrCreateConstantIndexOp(builder, loc,
+                                               loopRanges[loopLevelIdx].size);
+  Value tileSize =
+      getValueOrCreateConstantIndexOp(builder, loc, tileSizes[loopLevelIdx]);
+
+  AffineExpr sym0, sym1, sym2;
+  bindSymbols(builder.getContext(), sym0, sym1, sym2);
+  AffineMap defaultSplitMap =
+      AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
+  // Simplified map for use when step is power of 2 and lower bound
+  // is exactly divisible by step.
+  AffineMap powerSplitMap = AffineMap::get(0, 3, {sym1 - (sym1 % sym2)});
+
+  uint64_t tileSizeInt = *getConstantIntValue(tileSize);
+
+  // Enforce no tiling when tile size is zero.
+  // No need to create a loop here.
+  if (tileSizeInt == 0) {
+    continuousLoopNestHelper(builder, loc, loopRanges, loops, loopLevelIdx + 1,
+                             loopIdx, tileSizes, CTileVector, sizesMap,
+                             innermostLoops, destinationTensors,
+                             isHeadOrInsideHeadLoop);
+    return;
+  }
+
+  // The head loop is always tiled using the tile size specified
+  // in the size parameters to tile_using_for transform.
+  auto loop = builder.create<scf::ForOp>(
+      loc, offset, size, tileSize, destinationTensors,
+      [&](OpBuilder &bodyBuilder, Location bodyLoc, Value iv,
+          ValueRange /*iterArgs*/) {
+        sizesMap[loopIdx] =
+            getBoundedTileSize(bodyBuilder, bodyLoc, loopRanges[loopLevelIdx],
+                               iv, getAsOpFoldResult(tileSize));
+      });
+
+  loop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+  ++loopIdx;
+
+  scf::ForOp currentLoop = loop;
+  auto lbInt = getConstantIntValue(currentLoop.getLowerBound());
+  // Use simplified powerSplitMap instead of the default when possible.
+  bool usePowerSplit = (lbInt.has_value()) &&
+                       (*lbInt % tileSizeInt == static_cast<int64_t>(0)) &&
+                       (tileSizeInt == llvm::bit_floor(tileSizeInt));
+
+  AffineMap splitMap = usePowerSplit ? powerSplitMap : defaultSplitMap;
+
+  bool isInnermostLoop = loopLevelIdx == loopRanges.size() - 1;
+  if (isInnermostLoop)
+    innermostLoops.push_back(currentLoop);
+
+  if (isHeadOrInsideHeadLoop)
+    loops.push_back(loop);
+
+  builder.setInsertionPointToEnd(loop.getBody());
+
+  // Create the nested loop inside current loop.
+  if (!isInnermostLoop)
+    continuousLoopNestHelper(builder, loop->getLoc(), loopRanges, loops,
+                             loopLevelIdx + 1, loopIdx, tileSizes, CTileVector,
+                             sizesMap, innermostLoops, loop.getRegionIterArgs(),
+                             isHeadOrInsideHeadLoop);
+
+  // Apply continuous tiling to current loop if continuous_tiles
+  // specifies so.
+  while (CTileVector[loopLevelIdx] && tileSizeInt > 1) {
+
+    uint64_t maxPower = llvm::bit_floor(tileSizeInt);
+    tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
+
+    builder.setInsertionPoint(currentLoop);
+
+    auto constStepOp = builder.create<arith::ConstantIndexOp>(loc, tileSizeInt);
+
+    Value splitBound = builder.createOrFold<affine::AffineApplyOp>(
+        loc, splitMap,
+        ValueRange{currentLoop.getLowerBound(), currentLoop.getUpperBound(),
+                   currentLoop.getStep()});
+
+    builder.setInsertionPointAfter(currentLoop);
+    auto additionalLoop =
+        builder.create<scf::ForOp>(currentLoop->getLoc(), splitBound, size,
+                                   constStepOp, destinationTensors);
+
+    additionalLoop.getInitArgsMutable().assign(currentLoop->getResults());
+    currentLoop.getUpperBoundMutable().assign(splitBound);
+
+    builder.setInsertionPointToStart(additionalLoop.getBody());
+    AffineExpr s0, s1, d0;
+    bindDims(builder.getContext(), d0);
+    bindSymbols(builder.getContext(), s0, s1);
+    AffineMap minMap = AffineMap::get(1, 1, {s0}, builder.getContext());
+    auto additionalLoopAffineMin = affine::makeComposedAffineMin(
+        builder, loc, minMap,
+        SmallVector<OpFoldResult>{splitBound, getAsOpFoldResult(constStepOp),
+                                  size});
+
+    currentLoop = additionalLoop;
+
+    sizesMap[loopIdx] = getAsOpFoldResult(additionalLoopAffineMin);
+
+    // Add custom loop-indexing attribute to each loop op.
+    // Continuous tiling ends up generating many loop nestings and
+    // each loop can be identified with its loop-index attribute.
+    // This is needed later to retrieve the sizes from sizesMap.
+    currentLoop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+
+    ++loopIdx;
+
+    if (isInnermostLoop)
+      innermostLoops.push_back(currentLoop);
+
+    builder.setInsertionPointToEnd(currentLoop.getBody());
+
+    // Create the nested loop inside current loop.
+    if (!isInnermostLoop)
+      continuousLoopNestHelper(builder, currentLoop->getLoc(), loopRanges,
+                               loops, loopLevelIdx + 1, loopIdx, tileSizes,
+                               CTileVector, sizesMap, innermostLoops,
+                               currentLoop.getRegionIterArgs());
+  }
+
+  // Always yield the result of the tail-end loop as this
+  // will have all the processed tiles.
+  if (!isa<func::ReturnOp>(currentLoop->getBlock()->back())) {
+    builder.setInsertionPointToEnd(currentLoop->getBlock());
+    builder.create<scf::YieldOp>(currentLoop.getLoc(),
+                                 currentLoop.getResults());
+  }
+  /// For the outermost loop insert the tail-end loop in front of loops
+  /// structure so that its results can be used for replacements in the
+  /// function return. This is removed from the head of loops later.
+  else
+    loops.insert(loops.begin(), currentLoop);
+
+  destinationTensors = loop.getRegionIterArgs();
+}
+
+/// Generate an empty loop nest that represents the continuous-tiled loop nest
+/// shell.
+/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
+/// - `tileSizes` is the tile sizes to use in the first tiling attempt. Zero
+/// represents untiled loops.
+/// - In `sizesMap` return the multi-dimensional size of
+///   the tile processed within the inner most loop.
+/// - `CTileVector` specifies which loop nest should be continuously tiled.
+/// Note that this method adds `scf.yield` operations for all but the innermost
+/// loop. These yield the value returned by the immediately inner tail-end loop.
+/// The caller is expected to add the scf.yield operation for all innermost
+/// loops.
+static SmallVector<LoopLikeOpInterface> generateContinuousTileLoopNest(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    ArrayRef<OpFoldResult> tileSizes, SmallVector<bool> CTileVector,
+    std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops,
+    ValueRange destinationTensors = {}) {
+  if (loopRanges.empty())
+    return {};
+  assert(loopRanges.size() == tileSizes.size() &&
+         "expected as many tile sizes as loop ranges");
+  OpBuilder::InsertionGuard guard(builder);
+  SmallVector<LoopLikeOpInterface> loops;
+
+  uint64_t loopIdx = 0;
+  continuousLoopNestHelper(builder, loc, loopRanges, loops, 0, loopIdx,
+                           tileSizes, CTileVector, sizesMap, innermostLoops,
+                           destinationTensors, true);
+
+  return loops;
+}
+
+
 /// Append the specified additional `newInitOperands` operands to the
 /// loops existing `init` operands (or similar), and replace `loopOp` with
 /// the new loop that has the additional init operands. The loop body of
@@ -679,6 +864,278 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   return scf::SCFTilingResult{tilingResult->tiledOps, loops, replacements};
 }
 
+/// Implementation of continuous-tiling transformation of `op`
+/// that implements the `TilingInterface` using a sequence of
+/// `scf.for` loops to iterate over tiles of exponentially
+/// diminishing sizes.
+///
+/// The generated sequence of `scf.for` loops iterate over tiles of
+/// exponentially diminishing sizes as opposed to vanilla tiling scheme where
+/// only the tile sizes specified as transform parameters are used.
+/// continuous-tiling first applies regular tiling using the specified
+/// tile size, then halves the tile size and uses it to tile the leftover
+/// chunk. This process of halving the tile size and tiling the leftover
+/// chunk is repeated until tile size reaches 1. The transform parameter
+/// continuous_tiles controls which nested loop should be tiled. If all
+/// arguments in continuous_tiles are set to false, the result is identical
+/// to vanilla tiling transform.
+///
+/// When tiling a tensor of size M with tile size 8, it generates
+/// the following loop to tile
+///
+/// for (int i = 0; i < M; i += 8) {
+///   int size = min(8, M-i);
+///   // size is unknown at compile time because M is dynamic
+///   // compute using dynami...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/82792


More information about the Mlir-commits mailing list