[Mlir-commits] [mlir] [MLIR] Add continuous tiling to TileUsingForOp (PR #82792)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Tue Feb 27 03:13:49 PST 2024


================
@@ -309,6 +311,188 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
   return rewriter.notifyMatchFailure(loc, "unhandled loop type");
 }
 
+static void continuousLoopNestHelper(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    SmallVector<LoopLikeOpInterface> &loops, uint64_t loopLevelIdx,
+    uint64_t &loopIdx, ArrayRef<OpFoldResult> tileSizes,
+    SmallVector<bool> &CTileVector, std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops, ValueRange destinationTensors = {},
+    bool isHeadOrInsideHeadLoop = false) {
+  // Builds the scf.for for level `loopLevelIdx`, recurses for inner levels, then appends continuous-tiling tail loops.
+  Value offset = getValueOrCreateConstantIndexOp(
+      builder, loc, loopRanges[loopLevelIdx].offset);
+  Value size = getValueOrCreateConstantIndexOp(builder, loc,
+                                               loopRanges[loopLevelIdx].size);
+  Value tileSize =
+      getValueOrCreateConstantIndexOp(builder, loc, tileSizes[loopLevelIdx]);
+  // Symbol order matches the affine.apply operands below: sym0 = lower bound, sym1 = upper bound, sym2 = step.
+  AffineExpr sym0, sym1, sym2;
+  bindSymbols(builder.getContext(), sym0, sym1, sym2);
+  AffineMap defaultSplitMap =
+      AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
+  // Simplified map for use when step is power of 2 and lower bound
+  // is exactly divisible by step.
+  AffineMap powerSplitMap = AffineMap::get(0, 3, {sym1 - (sym1 % sym2)});
+  // NOTE(review): the optional is dereferenced unchecked; this assumes the tile size is a constant -- confirm.
+  uint64_t tileSizeInt = *getConstantIntValue(tileSize);
+
+  // Enforce no tiling when tile size is zero.
+  // No need to create a loop here; go straight to the next level. NOTE(review): loopLevelIdx + 1 is not range-checked.
+  if (tileSizeInt == 0) {
+    continuousLoopNestHelper(builder, loc, loopRanges, loops, loopLevelIdx + 1,
+                             loopIdx, tileSizes, CTileVector, sizesMap,
+                             innermostLoops, destinationTensors,
+                             isHeadOrInsideHeadLoop);
+    return;
+  }
+
+  // The head loop is always tiled using the tile size specified
+  // in the size parameters to tile_using_for transform.
+  auto loop = builder.create<scf::ForOp>(
+      loc, offset, size, tileSize, destinationTensors,
+      [&](OpBuilder &bodyBuilder, Location bodyLoc, Value iv,
+          ValueRange /*iterArgs*/) {
+        sizesMap[loopIdx] =
+            getBoundedTileSize(bodyBuilder, bodyLoc, loopRanges[loopLevelIdx],
+                               iv, getAsOpFoldResult(tileSize));
+      });
+  // Label the loop with loopIdx, the same counter used as the sizesMap key above, then advance the counter.
+  loop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+  ++loopIdx;
+
+  scf::ForOp currentLoop = loop;
+  auto lbInt = getConstantIntValue(currentLoop.getLowerBound());
+  // Use simplified powerSplitMap instead of the default when possible.
+  bool usePowerSplit = (lbInt.has_value()) &&
+                       (*lbInt % tileSizeInt == static_cast<int64_t>(0)) &&
+                       (tileSizeInt == llvm::bit_floor(tileSizeInt));
+
+  AffineMap splitMap = usePowerSplit ? powerSplitMap : defaultSplitMap;
+
+  bool isInnermostLoop = loopLevelIdx == loopRanges.size() - 1;
+  if (isInnermostLoop)
+    innermostLoops.push_back(currentLoop);
+
+  if (isHeadOrInsideHeadLoop)
+    loops.push_back(loop);
+  // Ops created from here on are placed at the end of the head loop body.
+  builder.setInsertionPointToEnd(loop.getBody());
+
+  // Create the nested loop inside current loop.
+  if (!isInnermostLoop)
+    continuousLoopNestHelper(builder, loop->getLoc(), loopRanges, loops,
+                             loopLevelIdx + 1, loopIdx, tileSizes, CTileVector,
+                             sizesMap, innermostLoops, loop.getRegionIterArgs(),
+                             isHeadOrInsideHeadLoop);
+
+  // Apply continuous tiling to current loop if continuous_tiles
+  // specifies so. Each iteration adds one tail loop with a smaller step, until the step reaches 1.
+  while (CTileVector[loopLevelIdx] && tileSizeInt > 1) {
+    // Next step: the largest power of two strictly below the current tile size.
+    uint64_t maxPower = llvm::bit_floor(tileSizeInt);
+    tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
+
+    builder.setInsertionPoint(currentLoop);
+
+    auto constStepOp = builder.create<arith::ConstantIndexOp>(loc, tileSizeInt);
+    // splitBound becomes the upper bound of the current loop and the lower bound of the tail loop.
+    Value splitBound = builder.createOrFold<affine::AffineApplyOp>(
+        loc, splitMap,
+        ValueRange{currentLoop.getLowerBound(), currentLoop.getUpperBound(),
+                   currentLoop.getStep()});
+
+    builder.setInsertionPointAfter(currentLoop);
+    auto additionalLoop =
+        builder.create<scf::ForOp>(currentLoop->getLoc(), splitBound, size,
+                                   constStepOp, destinationTensors);
+    // Chain the loops: the tail loop is initialized from the results of the current loop.
+    additionalLoop.getInitArgsMutable().assign(currentLoop->getResults());
+    currentLoop.getUpperBoundMutable().assign(splitBound);
+    // NOTE(review): minMap declares 1 dim + 1 symbol but three operands are passed; d0 and s1 are unused -- confirm.
+    builder.setInsertionPointToStart(additionalLoop.getBody());
+    AffineExpr s0, s1, d0;
+    bindDims(builder.getContext(), d0);
+    bindSymbols(builder.getContext(), s0, s1);
+    AffineMap minMap = AffineMap::get(1, 1, {s0}, builder.getContext());
+    auto additionalLoopAffineMin = affine::makeComposedAffineMin(
+        builder, loc, minMap,
+        SmallVector<OpFoldResult>{splitBound, getAsOpFoldResult(constStepOp),
+                                  size});
+
+    currentLoop = additionalLoop;
+
+    sizesMap[loopIdx] = getAsOpFoldResult(additionalLoopAffineMin);
+
+    // Add custom loop-indexing attribute to each loop op.
+    // Continuous tiling ends up generating many loop nestings and
+    // each loop can be identified with its loop-index attribute.
+    // This is needed later to retrieve the sizes from sizesMap.
+    currentLoop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+
+    ++loopIdx;
+
+    if (isInnermostLoop)
+      innermostLoops.push_back(currentLoop);
+
+    builder.setInsertionPointToEnd(currentLoop.getBody());
+
+    // Create the nested loop inside current loop. isHeadOrInsideHeadLoop is left at its default (false) for tail loops.
+    if (!isInnermostLoop)
+      continuousLoopNestHelper(builder, currentLoop->getLoc(), loopRanges,
+                               loops, loopLevelIdx + 1, loopIdx, tileSizes,
+                               CTileVector, sizesMap, innermostLoops,
+                               currentLoop.getRegionIterArgs());
+  }
+
+  // Always yield the result of the tail-end loop as this
+  // will have all the processed tiles. Skipped when the enclosing block ends in func.return.
+  if (!isa<func::ReturnOp>(currentLoop->getBlock()->back())) {
+    builder.setInsertionPointToEnd(currentLoop->getBlock());
+    builder.create<scf::YieldOp>(currentLoop.getLoc(),
+                                 currentLoop.getResults());
+  }
+  /// For the outermost loop insert the tail-end loop in front of loops
+  /// structure so that its results can be used for replacements in the
+  /// function return. This is removed from the head of loops later.
+  else
+    loops.insert(loops.begin(), currentLoop);
+  // NOTE(review): destinationTensors is a by-value parameter, so this assignment is not visible to the caller.
+  destinationTensors = loop.getRegionIterArgs();
+}
+
+/// Generate an empty loop nest that represents the continuous-tiled loop nest
+/// shell.
+/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
+/// - `tileSizes` is the tile sizes to use in the first tiling attempt. Zero
+/// represents untiled loops.
+/// - In `sizesMap` return the multi-dimensional size of
+///   the tile processed within the inner most loop.
+/// - `CTileVector` specifies which loop nest should be continuously tiled.
+/// Note that this method adds `scf.yield` operation for all but the innermost
+/// loop. These yield the value returned by the immediately inner tail-end loop.
+/// The caller is expected to add the scf.yield operation for all innermost
----------------
muneebkhan85 wrote:

Sorry, but this was a leftover from the previous "copied" comment. The scf.yield is actually generated for all inner loops as part of the call itself after their creation (as you suggested). 

https://github.com/llvm/llvm-project/pull/82792


More information about the Mlir-commits mailing list