[Mlir-commits] [mlir] [mlir][sparse] support sparsification to coiterate operations. (PR #102546)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Aug 8 15:55:45 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: Peiming Liu (PeimingLiu)
<details>
<summary>Changes</summary>
The PR supports sparsifying sparse kernel into `sparse_tensor.coiterate` operations, a higher level abstraction for coiterating multiple sparse spaces.
Lowering from `sparse_tensor.coiterate` to `scf.for`/`scf.while` are still missing, hence no e2e demo is available yet.
The lowering path is still considered to be experimental and incomplete.
---
Patch is 24.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102546.diff
8 Files Affected:
- (modified) mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td (+4)
- (modified) mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp (+16)
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp (+1-1)
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp (+91-33)
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.h (+4)
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp (+105-22)
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h (+8-4)
- (modified) mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir (+54-2)
``````````diff
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 6e17f804993e2a..d6e27dc0f75a4b 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1749,6 +1749,10 @@ def SparseTensor_CoIterateOp : SparseTensor_Op<"coiterate",
let results = (outs Variadic<AnyType>:$results);
let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions);
+ let builders = [
+ OpBuilder<(ins "ValueRange":$iterSpace, "ValueRange":$initArgs, "unsigned":$numCases)>,
+ ];
+
let extraClassDeclaration = [{
unsigned getSpaceDim() {
return llvm::cast<::mlir::sparse_tensor::IterSpaceType>(
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 1135ea32fe1abb..277f4c81564ecd 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -2594,6 +2594,22 @@ void IterateOp::getSuccessorRegions(RegionBranchPoint point,
regions.push_back(RegionSuccessor(getResults()));
}
+void CoIterateOp::build(OpBuilder &builder, OperationState &odsState,
+ ValueRange iterSpaces, ValueRange initArgs,
+ unsigned numCases) {
+ unsigned rank =
+ cast<IterSpaceType>(iterSpaces.front().getType()).getSpaceDim();
+ // All ones.
+ I64BitSet set((1 << rank) - 1);
+ // Fake cases bits. We need to preallocate all the regions as Region can not
+ // be dynamically added later after the operation is created.
+ SmallVector<int64_t> caseBits(numCases, 0);
+ ArrayAttr cases = builder.getI64ArrayAttr(caseBits);
+ return CoIterateOp::build(builder, odsState, initArgs.getTypes(), iterSpaces,
+ initArgs, set, cases,
+ /*caseRegionsCount=*/numCases);
+}
+
ParseResult CoIterateOp::parse(OpAsmParser &parser, OperationState &result) {
SmallVector<Value> spaces;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 5fb009e3eebe66..cc372ed1be6217 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -1395,7 +1395,7 @@ struct ForeachRewriter : public OpRewritePattern<ForeachOp> {
loopEmitter.enterNewLoopSeq(rewriter, loc, tidLvls);
// Note that reduc will be taken care of by loop emitter and get updated
// in place.
- loopEmitter.enterCoIterationOverTensorsAtLvls(rewriter, loc, tidLvls,
+ loopEmitter.enterCoIterationOverTensorsAtLvls(rewriter, loc, tidLvls, 1,
reduc);
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 08fc104fcbeead..b4575592a8ff2f 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -842,11 +842,13 @@ static bool shouldTryParallize(CodegenEnv &env, LoopId curr,
/// one sparse level in the list.
static Operation *genCoIteration(CodegenEnv &env, OpBuilder &builder,
ArrayRef<TensorLevel> tidLvls,
- bool tryParallel, bool needsUniv) {
+ unsigned numCases, bool tryParallel,
+ bool needsUniv) {
Operation *loop = *env.genLoopBoundary([&](MutableArrayRef<Value> reduc) {
// Construct while-loop with a parameter for each index.
return env.emitter().enterCoIterationOverTensorsAtLvls(
- builder, env.op().getLoc(), tidLvls, reduc, tryParallel, needsUniv);
+ builder, env.op().getLoc(), tidLvls, numCases, reduc, tryParallel,
+ needsUniv);
});
assert(loop);
return loop;
@@ -855,9 +857,11 @@ static Operation *genCoIteration(CodegenEnv &env, OpBuilder &builder,
/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
static Operation *genLoop(CodegenEnv &env, OpBuilder &builder, LoopId curr,
- bool needsUniv, ArrayRef<TensorLevel> tidLvls) {
+ unsigned numCases, bool needsUniv,
+ ArrayRef<TensorLevel> tidLvls) {
bool tryParallel = shouldTryParallize(env, curr, tidLvls);
- return genCoIteration(env, builder, tidLvls, tryParallel, needsUniv);
+ return genCoIteration(env, builder, tidLvls, numCases, tryParallel,
+ needsUniv);
}
/// Generates the induction structure for a while-loop.
@@ -900,6 +904,26 @@ static void finalizeWhileOp(CodegenEnv &env, OpBuilder &builder,
// basic block where scf::Yield should be inserted.
}
+/// Generate a case region in the coiterate operation.
+static void genCoIterationCase(CodegenEnv &env, OpBuilder &builder,
+ unsigned caseIdx, LatPointId allCase,
+ LatPointId curCase,
+ MutableArrayRef<Value> reduc) {
+ assert(allCase == curCase || env.merger().latGT(allCase, curCase));
+ const BitVector &allCaseBits = env.merger().lat(allCase).simple;
+ const BitVector &curCaseBits = env.merger().lat(curCase).simple;
+
+ /// Computes the subset of iterators that are valid in the current case being
+ /// generated.
+ I64BitSet caseBit(0);
+ for (auto [idx, set] : llvm::enumerate(allCaseBits.set_bits()))
+ if (curCaseBits.test(set))
+ caseBit.set(idx);
+
+ env.emitter().enterCurCoIterationCase(builder, env.op().getLoc(), caseBit,
+ caseIdx, reduc);
+}
+
/// Generates a single if-statement within a while-loop.
static scf::IfOp genIf(CodegenEnv &env, OpBuilder &builder, LoopId curr,
LatPointId p) {
@@ -1175,7 +1199,10 @@ static bool translateBitsToTidLvlPairs(
/// Starts a single loop in current sequence.
static std::pair<Operation *, bool> startLoop(CodegenEnv &env,
OpBuilder &builder, LoopId curr,
- LatPointId li, bool needsUniv) {
+ LatPointId li, unsigned numCases,
+ bool needsUniv) {
+ // TODO: numCases only used when generating iterator-based loops. Cleanup
+ // after fully migration.
// The set of tensors + lvls to generate loops on
SmallVector<TensorLevel> tidLvls;
@@ -1186,7 +1213,7 @@ static std::pair<Operation *, bool> startLoop(CodegenEnv &env,
translateBitsToTidLvlPairs(env, li, curr, tidLvls, affineTidLvls);
// Emit the for/while-loop control.
- Operation *loop = genLoop(env, builder, curr, needsUniv, tidLvls);
+ Operation *loop = genLoop(env, builder, curr, numCases, needsUniv, tidLvls);
Location loc = env.op().getLoc();
for (auto [tidLvl, exp] : affineTidLvls) {
env.emitter().locateLvlAtAffineAddress(builder, loc, tidLvl, exp);
@@ -1259,42 +1286,73 @@ static void genStmt(CodegenEnv &env, RewriterBase &rewriter, ExprId exp,
// Start a loop sequence.
bool needsUniv = startLoopSeq(env, rewriter, exp, curr, lts);
- // Emit a loop for every lattice point L0 >= Li in this loop sequence.
- // We cannot change this to `for (const LatPointId li : env.set(lts))`
- // because the loop body causes data-movement which invalidates
- // the iterator.
+ // When using sparse-iterator-based loops, we only need one loops, as
+ // opposed to a loop sequence, to cover all the iterator spaces.
const unsigned lsize = env.set(lts).size();
- for (unsigned i = 0; i < lsize; i++) {
- const LatPointId li = env.set(lts)[i];
- // Start a loop.
- auto [loop, isSingleCond] = startLoop(env, rewriter, curr, li, needsUniv);
-
- // Visit all lattices points with Li >= Lj to generate the
- // loop-body, possibly with if statements for coiteration.
- Value redInput = env.getReduc();
- Value cntInput = env.getExpandCount();
- Value insInput = env.getInsertionChain();
- Value validIns = env.getValidLexInsert();
- // We cannot change this to `for (const LatPointId lj : env.set(lts))`
- // because the loop body causes data-movement which invalidates the
- // iterator.
+ if (env.generatingSparseIterator()) {
+ // Get the largest lattice point and start a loop.
+ const LatPointId li = env.set(lts)[0];
+ auto [loop, isSingleCond] =
+ startLoop(env, rewriter, curr, li, lsize, needsUniv);
+ assert(isSingleCond == llvm::isa<IterateOp>(loop));
+ // We cannot change this to `for (const LatPointId li : env.set(lts))`
+ // because the loop body causes data-movement which invalidates
+ // the iterator.
for (unsigned j = 0; j < lsize; j++) {
const LatPointId lj = env.set(lts)[j];
const ExprId ej = env.lat(lj).exp;
- if (li == lj || env.merger().latGT(li, lj)) {
- // Recurse into body of each branch.
- if (!isSingleCond) {
- scf::IfOp ifOp = genIf(env, rewriter, curr, lj);
- genStmt(env, rewriter, ej, curr + 1);
- endIf(env, rewriter, ifOp, redInput, cntInput, insInput, validIns);
- } else {
+ // Recurse into body of each branch.
+ if (!isSingleCond) {
+ env.genLoopBoundary([&, curr, j, li, lj](MutableArrayRef<Value> reduc) {
+ genCoIterationCase(env, rewriter, /*caseIdx*/ j, li, lj, reduc);
genStmt(env, rewriter, ej, curr + 1);
- }
+ // TODO: handle yield values.
+ assert(reduc.empty() && "Not Implemented");
+ rewriter.create<sparse_tensor::YieldOp>(env.op().getLoc());
+ return std::nullopt;
+ });
+ // endIf(env, rewriter, ifOp, redInput, cntInput, insInput, validIns);
+ } else {
+ genStmt(env, rewriter, ej, curr + 1);
}
}
-
// End a loop.
needsUniv = endLoop(env, rewriter, loop, curr, needsUniv, isSingleCond);
+ } else {
+ // Emit a loop for every lattice point L0 >= Li in this loop sequence.
+ for (unsigned i = 0; i < lsize; i++) {
+ const LatPointId li = env.set(lts)[i];
+ // Start a loop.
+ auto [loop, isSingleCond] =
+ startLoop(env, rewriter, curr, li, lsize, needsUniv);
+
+ // Visit all lattices points with Li >= Lj to generate the
+ // loop-body, possibly with if statements for coiteration.
+ Value redInput = env.getReduc();
+ Value cntInput = env.getExpandCount();
+ Value insInput = env.getInsertionChain();
+ Value validIns = env.getValidLexInsert();
+ // We cannot change this to `for (const LatPointId lj : env.set(lts))`
+ // because the loop body causes data-movement which invalidates the
+ // iterator.
+ for (unsigned j = 0; j < lsize; j++) {
+ const LatPointId lj = env.set(lts)[j];
+ const ExprId ej = env.lat(lj).exp;
+ if (li == lj || env.merger().latGT(li, lj)) {
+ // Recurse into body of each branch.
+ if (!isSingleCond) {
+ scf::IfOp ifOp = genIf(env, rewriter, curr, lj);
+ genStmt(env, rewriter, ej, curr + 1);
+ endIf(env, rewriter, ifOp, redInput, cntInput, insInput, validIns);
+ } else {
+ genStmt(env, rewriter, ej, curr + 1);
+ }
+ }
+ }
+
+ // End a loop.
+ needsUniv = endLoop(env, rewriter, loop, curr, needsUniv, isSingleCond);
+ }
}
// End a loop sequence.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.h
index d69ae53fb0f298..34b793ee11e4ca 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.h
@@ -49,6 +49,10 @@ class CodegenEnv {
linalg::GenericOp op() const { return linalgOp; }
const SparsificationOptions &options() const { return sparseOptions; }
+ bool generatingSparseIterator() const {
+ return sparseOptions.sparseEmitStrategy ==
+ SparseEmitStrategy::kSparseIterator;
+ }
Merger &merger() { return latticeMerger; }
LoopEmitter &emitter() { return loopEmitter; }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index 2be0193f0de83e..0dce63fe593289 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -615,33 +615,104 @@ bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<SparseIterator *> spIters) {
return true;
}
+Region *LoopEmitter::enterCurCoIterationCase(OpBuilder &builder, Location loc,
+ I64BitSet caseBit,
+ unsigned caseIdx,
+ MutableArrayRef<Value> reduc) {
+ auto coIterOp = cast<CoIterateOp>(loopStack.back().loop);
+ SmallVector<Attribute> cases(coIterOp.getCases().getAsRange<Attribute>());
+ cases[caseIdx] = builder.getI64IntegerAttr(caseBit);
+
+ coIterOp.setCasesAttr(builder.getArrayAttr(cases));
+ Region &caseRegion = coIterOp.getRegion(caseIdx);
+ assert(caseRegion.getBlocks().empty() &&
+ "re-initialize the same coiteration case region.");
+
+ // Each block starts with a list of used coordinates of index type.
+ SmallVector<Type> blockArgTps(coIterOp.getCrdUsedLvls().count(),
+ builder.getIndexType());
+ // Follows by a list of user-provided iteration arguments.
+ TypeRange iterArgsTps = coIterOp.getInitArgs().getTypes();
+ blockArgTps.append(iterArgsTps.begin(), iterArgsTps.end());
+ // Ends with a set of iterators that defines the actually iteration space.
+ for (auto i : caseBit.bits()) {
+ blockArgTps.push_back(
+ cast<IterSpaceType>(coIterOp.getIterSpaces()[i].getType())
+ .getIteratorType());
+ }
+ SmallVector<Location> locs(blockArgTps.size(), loc);
+ caseRegion.emplaceBlock().addArguments(blockArgTps, locs);
+
+ // Entering the new region scope, updating the SSA chain.
+ builder.setInsertionPointToStart(&caseRegion.front());
+ // Update the coordinates.
+ loopStack.back().iv = coIterOp.getCrds(caseIdx).front();
+ // Updates loop iteration arguments.
+ ValueRange iterArgs = coIterOp.getRegionIterArgs(caseIdx);
+ llvm::copy(iterArgs, reduc.begin());
+ // Updates sparse iterator values.
+ ValueRange iters = coIterOp.getRegionIterators(caseIdx);
+ ArrayRef<TensorLevel> tidLvls = loopStack.back().tidLvls;
+ for (auto [i, tl] : llvm::enumerate(unpackTensorLevelRange(tidLvls))) {
+ if (caseBit[i]) {
+ spIterVals[tl.first][tl.second] = iters.front();
+ iters = iters.drop_front();
+ } else {
+ spIterVals[tl.first][tl.second] = nullptr;
+ }
+ }
+ // Must have consumed all iterator SSA values.
+ assert(iters.empty());
+ return &caseRegion;
+}
+
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
- MutableArrayRef<Value> reduc, bool tryParallel, bool needsUniv) {
-
+ unsigned numCases, MutableArrayRef<Value> reduc, bool tryParallel,
+ bool needsUniv) {
+ // TODO: Argument `numCases` only used when generating iterator-based sparse
+ // loops. Simplify the code upon feature complete.
// TODO: handle coiteration with sparse iterator.
if (emitStrategy == SparseEmitStrategy::kSparseIterator) {
- assert(tidLvls.size() == 1);
- auto [tid, lvl] = unpackTensorLevel(tidLvls.front());
- Value t = tensors[tid];
-
- // Extract and iterate over the iteration space.
- ExtractIterSpaceOp extractSpaceOp =
- lvl == 0 ? builder.create<ExtractIterSpaceOp>(loc, t)
- : builder.create<ExtractIterSpaceOp>(
- loc, t, spIterVals[tid][lvl - 1], lvl);
-
- IterateOp iterOp = builder.create<IterateOp>(
- loc, extractSpaceOp.getExtractedSpace(), reduc);
- spIterVals[tid][lvl] = iterOp.getIterator();
+ if (tidLvls.size() == 1) {
+ auto [tid, lvl] = unpackTensorLevel(tidLvls.front());
+ Value t = tensors[tid];
+
+ // Extract and iterate over the iteration space.
+ ExtractIterSpaceOp extractSpaceOp =
+ lvl == 0 ? builder.create<ExtractIterSpaceOp>(loc, t)
+ : builder.create<ExtractIterSpaceOp>(
+ loc, t, spIterVals[tid][lvl - 1], lvl);
+
+ IterateOp iterOp = builder.create<IterateOp>(
+ loc, extractSpaceOp.getExtractedSpace(), reduc);
+ spIterVals[tid][lvl] = iterOp.getIterator();
+
+ // Update the reduction varaibles.
+ llvm::copy(iterOp.getRegionIterArgs(), reduc.begin());
+ // Set the insertion point to loop body.
+ builder.setInsertionPointToStart(iterOp.getBody());
+ loopStack.emplace_back(tidLvls, iterOp, builder.getInsertionBlock(),
+ iterOp.getCrds().front(), loopTag);
+ return iterOp;
+ }
- // Update the reduction varaibles.
- llvm::copy(iterOp.getRegionIterArgs(), reduc.begin());
- // Set the insertion point to loop body.
- builder.setInsertionPointToStart(iterOp.getBody());
- loopStack.emplace_back(tidLvls, iterOp, builder.getInsertionBlock(),
- iterOp.getIterator(), loopTag);
- return iterOp;
+ // CoIteration Loops.
+ SmallVector<Value> spaces;
+ for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
+ Value t = tensors[tid];
+ ExtractIterSpaceOp extractSpaceOp =
+ lvl == 0 ? builder.create<ExtractIterSpaceOp>(loc, t)
+ : builder.create<ExtractIterSpaceOp>(
+ loc, t, spIterVals[tid][lvl - 1], lvl);
+ spaces.push_back(extractSpaceOp.getExtractedSpace());
+ }
+ auto coIterOp = builder.create<CoIterateOp>(loc, spaces, reduc, numCases);
+ // The CoIterationOp does not have insertion block nor induction variable.
+ // TODO: the `struct LoopInfo` should be simplied after full migration.
+ loopStack.emplace_back(tidLvls, coIterOp, /*insertion block*/ nullptr,
+ /*induction variable*/ nullptr, loopTag);
+ return coIterOp;
}
// TODO: support multiple return on parallel for?
@@ -866,6 +937,18 @@ void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
// Clean up the values, it would help use to discover potential bug at a
// earlier stage (instead of silently using a wrong value).
const LoopInfo &loopInfo = loopStack.back();
+ if (emitStrategy == SparseEmitStrategy::kSparseIterator) {
+ Operation *p = loopInfo.loop;
+ if (isa<IterateOp>(p))
+ rewriter.create<sparse_tensor::YieldOp>(loc, reduc);
+
+ // Exit the loop.
+ rewriter.setInsertionPointAfter(p);
+ // In-place update reduction variables.
+ llvm::copy(p->getResults(), reduc.begin());
+ loopStack.pop_back();
+ return;
+ }
// Sets the insertion point to the right position.
rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index f3e73e4692c1fd..4dc594118ad21c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -145,8 +145,12 @@ class LoopEmitter {
/// return the reduction variable used inside the generated loop.
Operation *enterCoIterationOverTensorsAtLvls(
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
- MutableArrayRef<Value> reduc = {}, bool isParallel = false,
- bool needsUniv = false);
+ unsigned numCases, MutableArrayRef<Value> reduc = {},
+ bool isParallel = false, bool needsUniv = false);
+
+ Region *enterCurCoIterationCase(OpBuilder &builder, Location loc,
+ I64BitSet caseBit, unsigned caseIdx,
+ MutableArrayRef<Value> reduc);
/// Generates code to exit the current loop (e.g., generates yields, forwards
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102546
More information about the Mlir-commits
mailing list