[Mlir-commits] [mlir] [mlir][scf] Add reductions support to `scf.parallel` fusion (PR #75955)
Renato Golin
llvmlistbot at llvm.org
Fri Jan 5 04:30:47 PST 2024
================
@@ -131,29 +131,83 @@ static bool isFusionLegal(ParallelOp firstPloop, ParallelOp secondPloop,
}
/// Prepends operations of firstPloop's body into secondPloop's body.
-static void fuseIfLegal(ParallelOp firstPloop, ParallelOp secondPloop,
- OpBuilder b,
+/// Updates secondPloop with new loop.
+static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
+ OpBuilder builder,
llvm::function_ref<bool(Value, Value)> mayAlias) {
+ Block *block1 = firstPloop.getBody();
+ Block *block2 = secondPloop.getBody();
IRMapping firstToSecondPloopIndices;
- firstToSecondPloopIndices.map(firstPloop.getBody()->getArguments(),
- secondPloop.getBody()->getArguments());
+ firstToSecondPloopIndices.map(block1->getArguments(), block2->getArguments());
if (!isFusionLegal(firstPloop, secondPloop, firstToSecondPloopIndices,
mayAlias))
return;
- b.setInsertionPointToStart(secondPloop.getBody());
- for (auto &op : firstPloop.getBody()->without_terminator())
- b.clone(op, firstToSecondPloopIndices);
+ DominanceInfo dom;
+ for (Operation *user : firstPloop->getUsers())
+ if (!dom.properlyDominates(secondPloop, user, /*enclosingOpOk*/ false))
+ return;
+
+ ValueRange inits1 = firstPloop.getInitVals();
+ ValueRange inits2 = secondPloop.getInitVals();
+
+ SmallVector<Value> newInitVars(inits1.begin(), inits1.end());
+ newInitVars.append(inits2.begin(), inits2.end());
+
+ IRRewriter b(builder);
+ b.setInsertionPoint(secondPloop);
+ auto newSecondPloop = b.create<ParallelOp>(
+ secondPloop.getLoc(), secondPloop.getLowerBound(),
+ secondPloop.getUpperBound(), secondPloop.getStep(), newInitVars);
+
+ Block *newBlock = newSecondPloop.getBody();
+ auto term1 = cast<ReduceOp>(block1->getTerminator());
+ auto term2 = cast<ReduceOp>(block2->getTerminator());
+
+ b.inlineBlockBefore(block2, newBlock, newBlock->begin(),
+ newBlock->getArguments());
+ b.inlineBlockBefore(block1, newBlock, newBlock->begin(),
+ newBlock->getArguments());
+
+ ValueRange results = newSecondPloop.getResults();
+ if (!results.empty()) {
+ b.setInsertionPointToEnd(newBlock);
+
+ ValueRange reduceArgs1 = term1.getOperands();
+ ValueRange reduceArgs2 = term2.getOperands();
+ SmallVector<Value> newReduceArgs(reduceArgs1.begin(), reduceArgs1.end());
+ newReduceArgs.append(reduceArgs2.begin(), reduceArgs2.end());
+
+ auto newReduceOp = b.create<scf::ReduceOp>(term2.getLoc(), newReduceArgs);
+
+ for (auto &&[i, reg] : llvm::enumerate(llvm::concat<Region>(
+ term1.getReductions(), term2.getReductions()))) {
+ Block &oldRedBlock = reg.front();
+ Block &newRedBlock = newReduceOp.getReductions()[i].front();
+ b.inlineBlockBefore(&oldRedBlock, &newRedBlock, newRedBlock.begin(),
+ newRedBlock.getArguments());
+ }
+
+ firstPloop.replaceAllUsesWith(results.take_front(inits1.size()));
+ secondPloop.replaceAllUsesWith(results.take_back(inits2.size()));
+ }
+ term1->erase();
+ term2->erase();
firstPloop.erase();
+ secondPloop.erase();
+ secondPloop = newSecondPloop;
}
void mlir::scf::naivelyFuseParallelOps(
Region ®ion, llvm::function_ref<bool(Value, Value)> mayAlias) {
OpBuilder b(region);
// Consider every single block and attempt to fuse adjacent loops.
+ SmallVector<SmallVector<ParallelOp>, 1> ploopChains;
for (auto &block : region) {
- SmallVector<SmallVector<ParallelOp, 8>, 1> ploopChains{{}};
+ ploopChains.clear();
----------------
rengolin wrote:
Fair enough. It's not the c-tor/d-tor that makes it slow, it's the promotion to heap, and if it's only done once, it's better.
https://github.com/llvm/llvm-project/pull/75955
More information about the Mlir-commits
mailing list