[Mlir-commits] [mlir] [MLIR] Add continuous tiling to TileUsingForOp (PR #82792)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sat Feb 24 09:45:15 PST 2024
================
@@ -309,6 +311,188 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
return rewriter.notifyMatchFailure(loc, "unhandled loop type");
}
+/// Recursively builds the loop nest shell used for continuous tiling, starting
+/// at dimension `loopLevelIdx` of `loopRanges`.
+///
+/// For a dimension whose tile size is non-zero, a "head" `scf.for` stepping by
+/// that tile size is created. When `CTileVector[loopLevelIdx]` is set,
+/// additional "tail" loops are appended after it; each one steps by the next
+/// smaller power of two (e.g. 6 -> 4 -> 2 -> 1), and the upper bound of the
+/// preceding loop is rewritten to the split point computed by an affine map.
+/// A dimension whose tile size is zero produces no loop at that level.
+///
+/// - `loopRanges` / `tileSizes` are indexed by `loopLevelIdx`; the tile size
+///   must be a compile-time constant.
+/// - `loops` receives every loop created while `isHeadOrInsideHeadLoop` is
+///   true. In addition, when the block holding the last loop of this level
+///   ends in a `func.return`, that loop is inserted at the front of `loops`.
+/// - `loopIdx` is a running counter; each created loop is tagged with it via
+///   the `kLoopIndexLabel` attribute and the counter is then incremented.
+/// - `sizesMap` maps that loop index to the tile size computed for the loop.
+/// - `innermostLoops` collects every loop created for the last dimension.
+/// - `destinationTensors` are used as the init args of the loops created at
+///   this level.
+static void continuousLoopNestHelper(
+    OpBuilder &builder, Location loc, ArrayRef<Range> loopRanges,
+    SmallVector<LoopLikeOpInterface> &loops, uint64_t loopLevelIdx,
+    uint64_t &loopIdx, ArrayRef<OpFoldResult> tileSizes,
+    SmallVector<bool> &CTileVector, std::map<int, OpFoldResult> &sizesMap,
+    SmallVector<scf::ForOp> &innermostLoops, ValueRange destinationTensors = {},
+    bool isHeadOrInsideHeadLoop = false) {
+
+  Value offset = getValueOrCreateConstantIndexOp(
+      builder, loc, loopRanges[loopLevelIdx].offset);
+  Value size = getValueOrCreateConstantIndexOp(builder, loc,
+                                               loopRanges[loopLevelIdx].size);
+  Value tileSize =
+      getValueOrCreateConstantIndexOp(builder, loc, tileSizes[loopLevelIdx]);
+
+  AffineExpr sym0, sym1, sym2;
+  bindSymbols(builder.getContext(), sym0, sym1, sym2);
+  // Operands are (lower bound, upper bound, step); the result is the upper
+  // bound rounded down so that (result - lower bound) is a multiple of step.
+  AffineMap defaultSplitMap =
+      AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
+  // Simplified map for use when step is power of 2 and lower bound
+  // is exactly divisible by step.
+  AffineMap powerSplitMap = AffineMap::get(0, 3, {sym1 - (sym1 % sym2)});
+
+  // The successive halving below needs the tile size as an integer, so a
+  // dynamic tile size cannot be handled here. Check instead of dereferencing
+  // an empty optional.
+  std::optional<int64_t> constTileSize = getConstantIntValue(tileSize);
+  assert(constTileSize.has_value() &&
+         "continuous tiling requires a constant tile size");
+  uint64_t tileSizeInt = *constTileSize;
+
+  // Enforce no tiling when tile size is zero.
+  // No need to create a loop here.
+  if (tileSizeInt == 0) {
+    // Only descend if there is a further dimension; recursing from the last
+    // dimension would index `loopRanges` and `tileSizes` out of range.
+    if (loopLevelIdx + 1 < loopRanges.size())
+      continuousLoopNestHelper(builder, loc, loopRanges, loops,
+                               loopLevelIdx + 1, loopIdx, tileSizes,
+                               CTileVector, sizesMap, innermostLoops,
+                               destinationTensors, isHeadOrInsideHeadLoop);
+    return;
+  }
+
+  // The head loop is always tiled using the tile size specified
+  // in the size parameters to tile_using_for transform.
+  auto loop = builder.create<scf::ForOp>(
+      loc, offset, size, tileSize, destinationTensors,
+      [&](OpBuilder &bodyBuilder, Location bodyLoc, Value iv,
+          ValueRange /*iterArgs*/) {
+        // Runs while the loop is being built, i.e. before `loopIdx` is
+        // incremented below, so the entry is keyed by this loop's index.
+        sizesMap[loopIdx] =
+            getBoundedTileSize(bodyBuilder, bodyLoc, loopRanges[loopLevelIdx],
+                               iv, getAsOpFoldResult(tileSize));
+      });
+
+  loop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+  ++loopIdx;
+
+  scf::ForOp currentLoop = loop;
+  auto lbInt = getConstantIntValue(currentLoop.getLowerBound());
+  // Use simplified powerSplitMap instead of the default when possible.
+  bool usePowerSplit = (lbInt.has_value()) &&
+                       (*lbInt % tileSizeInt == static_cast<int64_t>(0)) &&
+                       (tileSizeInt == llvm::bit_floor(tileSizeInt));
+
+  AffineMap splitMap = usePowerSplit ? powerSplitMap : defaultSplitMap;
+
+  bool isInnermostLoop = loopLevelIdx == loopRanges.size() - 1;
+  if (isInnermostLoop)
+    innermostLoops.push_back(currentLoop);
+
+  if (isHeadOrInsideHeadLoop)
+    loops.push_back(loop);
+
+  builder.setInsertionPointToEnd(loop.getBody());
+
+  // Create the nested loop inside current loop.
+  if (!isInnermostLoop)
+    continuousLoopNestHelper(builder, loop->getLoc(), loopRanges, loops,
+                             loopLevelIdx + 1, loopIdx, tileSizes, CTileVector,
+                             sizesMap, innermostLoops, loop.getRegionIterArgs(),
+                             isHeadOrInsideHeadLoop);
+
+  // Apply continuous tiling to current loop if continuous_tiles
+  // specifies so.
+  while (CTileVector[loopLevelIdx] && tileSizeInt > 1) {
+
+    // Next step: the largest power of two strictly below the current step.
+    uint64_t maxPower = llvm::bit_floor(tileSizeInt);
+    tileSizeInt = maxPower == tileSizeInt ? maxPower >> 1 : maxPower;
+
+    builder.setInsertionPoint(currentLoop);
+
+    auto constStepOp = builder.create<arith::ConstantIndexOp>(loc, tileSizeInt);
+
+    // Point at which the current loop stops and the next tail loop starts.
+    Value splitBound = builder.createOrFold<affine::AffineApplyOp>(
+        loc, splitMap,
+        ValueRange{currentLoop.getLowerBound(), currentLoop.getUpperBound(),
+                   currentLoop.getStep()});
+
+    builder.setInsertionPointAfter(currentLoop);
+    auto additionalLoop =
+        builder.create<scf::ForOp>(currentLoop->getLoc(), splitBound, size,
+                                   constStepOp, destinationTensors);
+
+    // Chain the loops: the tail loop starts from the results of the loop
+    // before it, and that loop now ends at the split point.
+    additionalLoop.getInitArgsMutable().assign(currentLoop->getResults());
+    currentLoop.getUpperBoundMutable().assign(splitBound);
+
+    builder.setInsertionPointToStart(additionalLoop.getBody());
+    AffineExpr s0, s1, d0;
+    bindDims(builder.getContext(), d0);
+    bindSymbols(builder.getContext(), s0, s1);
+    // NOTE(review): the map below declares 1 dim and 1 symbol and yields only
+    // `s0`, while three operands are passed to makeComposedAffineMin; `s1`
+    // and `d0` are unused. Confirm the intended min expression.
+    AffineMap minMap = AffineMap::get(1, 1, {s0}, builder.getContext());
+    auto additionalLoopAffineMin = affine::makeComposedAffineMin(
+        builder, loc, minMap,
+        SmallVector<OpFoldResult>{splitBound, getAsOpFoldResult(constStepOp),
+                                  size});
+
+    currentLoop = additionalLoop;
+
+    sizesMap[loopIdx] = getAsOpFoldResult(additionalLoopAffineMin);
+
+    // Add custom loop-indexing attribute to each loop op.
+    // Continuous tiling ends up generating many loop nestings and
+    // each loop can be identified with its loop-index attribute.
+    // This is needed later to retrieve the sizes from sizesMap.
+    currentLoop->setAttr(kLoopIndexLabel, builder.getIndexAttr(loopIdx));
+
+    ++loopIdx;
+
+    if (isInnermostLoop)
+      innermostLoops.push_back(currentLoop);
+
+    builder.setInsertionPointToEnd(currentLoop.getBody());
+
+    // Create the nested loop inside current loop. The head flag is left at
+    // its default (false) here, so loops nested in a tail loop are not
+    // appended to `loops`.
+    if (!isInnermostLoop)
+      continuousLoopNestHelper(builder, currentLoop->getLoc(), loopRanges,
+                               loops, loopLevelIdx + 1, loopIdx, tileSizes,
+                               CTileVector, sizesMap, innermostLoops,
+                               currentLoop.getRegionIterArgs());
+  }
+
+  // Always yield the result of the tail-end loop as this
+  // will have all the processed tiles.
+  if (!isa<func::ReturnOp>(currentLoop->getBlock()->back())) {
+    builder.setInsertionPointToEnd(currentLoop->getBlock());
+    builder.create<scf::YieldOp>(currentLoop.getLoc(),
+                                 currentLoop.getResults());
+  }
+  /// For the outermost loop insert the tail-end loop in front of loops
+  /// structure so that its results can be used for replacements in the
+  /// function return. This is removed from the head of loops later.
+  else
+    loops.insert(loops.begin(), currentLoop);
+}
+
+/// Generate an empty loop nest that represents the continuous-tiled loop nest
+/// shell.
+/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space.
+/// - `tileSizes` is the tile sizes to use in the first tiling attempt. Zero
+/// represents untiled loops.
+/// - In `sizesMap`, return the multi-dimensional size of
+/// the tile processed within the innermost loop.
+/// - `CTileVector` specifies which loop nest should be continuously tiled.
+/// Note that this method adds an `scf.yield` operation for all but the innermost
+/// loop. These yield the value returned by the immediately inner tail-end loop.
+/// The caller is expected to add the scf.yield operation for all innermost
----------------
MaheshRavishankar wrote:
I have found this to be a bit of a foot gun. At some point of the implementation I did the same where the caller would apply the `scf.yield`. But that is awkward for the caller to do. Can we instead try to generate the `scf.yield` during the transformation itself?
https://github.com/llvm/llvm-project/pull/82792
More information about the Mlir-commits
mailing list