[Mlir-commits] [mlir] 372e793 - [mlir][sparse] support affine expression on sparse dimensions (analysis implementation)
Peiming Liu
llvmlistbot at llvm.org
Tue Nov 22 16:02:53 PST 2022
Author: Peiming Liu
Date: 2022-11-23T00:02:47Z
New Revision: 372e7939d738d69e20db28c4433d4e161ed72397
URL: https://github.com/llvm/llvm-project/commit/372e7939d738d69e20db28c4433d4e161ed72397
DIFF: https://github.com/llvm/llvm-project/commit/372e7939d738d69e20db28c4433d4e161ed72397.diff
LOG: [mlir][sparse] support affine expression on sparse dimensions (analysis implementation)
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D138171
Added:
Modified:
mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 7f9f0e9f0e0d2..60829fe99951c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -148,14 +148,29 @@ struct LatPoint {
/// independently from the basic algorithm if bottlenecks are identified.
class Merger {
public:
- /// Constructs a merger for the given number of tensors and loops. The
- /// user supplies the number of tensors involved in the kernel, with the
- /// last tensor in this set denoting the output tensor. The merger adds an
- /// additional synthetic tensor at the end of this set to represent all
- /// invariant expressions in the kernel.
- Merger(unsigned t, unsigned l)
- : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1), numLoops(l),
- hasSparseOut(false),
+ /// Constructs a merger for the given number of tensors, native loops, and
+ /// filter loops. The user supplies the number of tensors involved in the
+ /// kernel, with the last tensor in this set denoting the output tensor. The
+ /// merger adds an additional synthetic tensor at the end of this set to
+ /// represent all invariant expressions in the kernel.
+ /// In addition to the native loops (which are specified by the GenericOp),
+ /// extra filter loops are needed in order to handle affine expressions on
+ /// sparse dimensions. E.g., for the affine map (d0, d1, d2) => (d0 + d1, d2),
+ /// a naive implementation of the filter loop could be generated as:
+ ///   for (coord : sparse_dim[0]) {
+ ///     if (coord == d0 + d1) {
+ ///       generated_code;
+ ///     }
+ ///   }
+ /// to filter out coordinates that are not equal to the affine expression
+ /// result.
+ /// TODO: we want to make the filter loop more efficient in the future, e.g.,
+ /// by avoiding a scan over the full list of stored indices (keeping the last
+ /// position in the ordered list) or even using binary search to find the
+ /// index.
+ Merger(unsigned t, unsigned l, unsigned fl)
+ : outTensor(t - 1), syntheticTensor(t), numTensors(t + 1),
+ numNativeLoops(l), numLoops(l + fl), hasSparseOut(false),
dimTypes(numTensors,
std::vector<DimLevelType>(numLoops, DimLevelType::Undef)),
loopIdxToDim(numTensors,
@@ -231,6 +246,15 @@ class Merger {
/// Bit translation (get loop index).
unsigned index(unsigned b) const { return b / numTensors; }
+ /// Get the number of total loops (native loops + filter loops).
+ unsigned getNumLoops() const { return numLoops; }
+ /// Get the number of native loops.
+ unsigned getNumNativeLoops() const { return numNativeLoops; }
+ /// Get the number of filter loops.
+ unsigned getNumFilterLoops() const { return numLoops - numNativeLoops; }
+ /// Get the starting filter loop index.
+ unsigned getFilterLoopStartingIdx() const { return getNumNativeLoops(); }
+
/// Returns true if bit corresponds to index of output tensor.
bool isOutTensor(unsigned b, unsigned i) const {
return tensor(b) == outTensor && index(b) == i;
@@ -242,6 +266,11 @@ class Merger {
/// expressions).
unsigned getSynTensorID() const { return syntheticTensor; }
+ /// Returns true if the given loop index denotes a filter loop.
+ bool isFilterLoop(unsigned ldx) const {
+ assert(ldx < numLoops);
+ return ldx >= numNativeLoops;
+ }
+
/// Returns true if given tensor iterates *only* in the given tensor
/// expression. For the output tensor, this defines a "simply dynamic"
/// operation [Bik96]. For instance: a(i) *= 2.0 or a(i) += a(i) for
@@ -345,6 +374,7 @@ class Merger {
const unsigned outTensor;
const unsigned syntheticTensor;
const unsigned numTensors;
+ const unsigned numNativeLoops;
const unsigned numLoops;
bool hasSparseOut;
// Map that converts pair<tensor id, loop id> to the corresponding dimension
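The new constructor and loop queries above can be pieced together as follows. A minimal sketch, not part of this patch, assuming the mlir::sparse_tensor namespace from Merger.h and a hypothetical kernel with 3 tensors, 3 native loops, and 1 filter loop (e.g., for a map (d0, d1, d2) => (d0 + d1, d2) with a compressed first level):

  #include <cassert>
  #include "mlir/Dialect/SparseTensor/Utils/Merger.h"

  // Sketch only: exercise the extended Merger API with illustrative counts.
  void mergerFilterLoopSketch() {
    using mlir::sparse_tensor::Merger;
    Merger merger(/*t=*/3, /*l=*/3, /*fl=*/1);
    // Native loops occupy indices [0, numNativeLoops); filter loops follow.
    assert(merger.getNumNativeLoops() == 3);
    assert(merger.getNumFilterLoops() == 1);
    assert(merger.getNumLoops() == 4);
    assert(merger.getFilterLoopStartingIdx() == 3);
    assert(!merger.isFilterLoop(2) && merger.isFilterLoop(3));
  }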
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
index e94224db2b347..a591bc3bd9b68 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -329,6 +329,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
return loop;
}
+Operation *SparseTensorLoopEmitter::enterFilterLoopOverTensorAtDim(
+ OpBuilder &builder, Location loc, size_t tid, size_t dim, AffineExpr affine,
+ MutableArrayRef<Value> reduc) {
+ llvm_unreachable("need to be implemented");
+}
+
void SparseTensorLoopEmitter::genDenseAffineAddressAtCurLevel(
OpBuilder &builder, Location loc, size_t tid, size_t dim,
AffineExpr affine) {
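enterFilterLoopOverTensorAtDim is left as a stub above (llvm_unreachable). Based on the filter-loop description in Merger.h, a future implementation would scan the stored coordinates of the compressed level and guard the body with an equality check. The plain C++ sketch below only illustrates that loop structure; the names pointers, indices, and body are hypothetical, and the real emitter would build scf/memref IR instead:

  #include <cstdint>
  #include <functional>

  // Sketch only: visit the stored coordinates of one compressed level in the
  // segment [pointers[lo], pointers[lo + 1]) and run the body solely for the
  // coordinate equal to the affine expression value `e`.
  void filterLoopSketch(const int64_t *pointers, const int64_t *indices,
                        int64_t lo, int64_t e,
                        const std::function<void(int64_t)> &body) {
    for (int64_t p = pointers[lo]; p < pointers[lo + 1]; ++p) {
      int64_t coord = indices[p];
      if (coord == e)
        body(coord);
    }
  }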
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
index 2987490e25907..7e6a350b7117b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -406,6 +406,11 @@ class SparseTensorLoopEmitter {
ArrayRef<size_t> extraTids = {},
ArrayRef<size_t> extraDims = {});
+ /// Emits a filter loop over tensor `tid` at dimension `dim` that keeps only
+ /// the coordinates matching the given affine expression (see the filter-loop
+ /// description in Merger.h).
+ Operation *enterFilterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+ size_t tid, size_t dim,
+ AffineExpr affine,
+ MutableArrayRef<Value> reduc = {});
+
void genDenseAffineAddressAtCurLevel(OpBuilder &builder, Location loc,
size_t tid, size_t dim,
AffineExpr affine);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index c9a15252c6ce4..b822a71e228ef 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -155,8 +155,11 @@ static AffineMap permute(MLIRContext *context, AffineMap m,
/// Helper method to inspect affine expressions. Rejects cases where the
/// same index is used more than once. Also rejects compound affine
/// expressions in sparse dimensions.
+/// filterLdx stores the filter loop index that should be used for the next
+/// compound affine expression on a sparse level; it is incremented by one
+/// every time it is used.
static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
- AffineExpr a, DimLevelType dlt,
+ AffineExpr a, DimLevelType dlt, unsigned &filterLdx,
bool setLvlFormat = true) {
switch (a.getKind()) {
case AffineExprKind::DimId: {
@@ -169,22 +172,68 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
return true;
}
case AffineExprKind::Add:
- case AffineExprKind::Mul: {
- if (!isDenseDLT(dlt))
- return false; // compound only in dense dim
- auto binOp = a.cast<AffineBinaryOpExpr>();
- // We do not set dim level format for affine expresssion like d0 + d1 on
- // both loop index at d0 and d1,
- return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, false) &&
- findAffine(merger, tensor, dim, binOp.getRHS(), dlt, false);
+ case AffineExprKind::Mul:
+ case AffineExprKind::Constant: {
+ if (!isDenseDLT(dlt) && setLvlFormat) {
+ assert(isUndefDLT(merger.getDimLevelType(tensor, filterLdx)));
+ // Use a filter loop for sparse affine expression.
+ merger.setDimAndDimLevelType(tensor, filterLdx++, dim, dlt);
+ }
+
+ if (auto binOp = a.dyn_cast<AffineBinaryOpExpr>()) {
+ // We do not set the dim level format for affine expressions like d0 + d1 on
+ // either loop index at d0 or d1.
+ // We continue the recursion merely to check whether the current affine
+ // expression is admissible or not.
+ return findAffine(merger, tensor, dim, binOp.getLHS(), dlt, filterLdx,
+ false) &&
+ findAffine(merger, tensor, dim, binOp.getRHS(), dlt, filterLdx,
+ false);
+ }
+ // Fall through when it is a constant affine expression.
+ return true;
}
- case AffineExprKind::Constant:
- return isDenseDLT(dlt); // const only in dense dim
default:
return false;
}
}
+/// Gets the total number of compound affine expressions in affineMap that are
+/// attached to sparse levels of the given tensor. For the following inputs:
+///
+/// affineMap = (d0, d1, d2) => (d0 + d1, d2)
+/// tensor = ["compressed", "compressed"]
+///
+/// Returns 1 (because the first level is compressed and its corresponding
+/// affine expression is d0 + d1).
+static unsigned getNumCompoundAffineOnSparseDims(AffineMap affineMap,
+ Value tensor) {
+ unsigned num = 0;
+ auto enc = getSparseTensorEncoding(tensor.getType());
+ if (enc) {
+ ArrayRef<AffineExpr> exps = affineMap.getResults();
+ for (unsigned rank = 0; rank < exps.size(); rank++) {
+ auto aidx = toOrigDim(enc, rank);
+ auto affine = exps[aidx];
+ if (!affine.isa<AffineDimExpr>())
+ if (!isDenseDLT(getDimLevelType(enc, rank)))
+ num++;
+ }
+ }
+
+ return num;
+}
+
+/// Gets the total number of compound affine expressions attached to sparse
+/// levels in the given GenericOp.
+static unsigned getNumCompoundAffineOnSparseDims(linalg::GenericOp op) {
+ unsigned num = 0;
+ for (OpOperand &t : op->getOpOperands())
+ num += getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(&t),
+ t.get());
+ return num;
+}
+
/// Helper method to inspect sparse encodings in the tensor types.
/// Fills the per-dimension sparsity information for all tensors.
/// Returns true if the sparse annotations and affine subscript
@@ -192,19 +241,22 @@ static bool findAffine(Merger &merger, unsigned tensor, unsigned dim,
/// no annotations are found or inadmissible constructs occur.
static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
bool annotated = false;
+ unsigned filterLdx = merger.getFilterLoopStartingIdx();
for (OpOperand &t : op->getOpOperands()) {
auto map = op.getMatchingIndexingMap(&t);
auto enc = getSparseTensorEncoding(t.get().getType());
if (enc)
annotated = true;
assert(map.getNumResults() == op.getRank(&t));
+
for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
unsigned tensor = t.getOperandNumber();
AffineExpr a = map.getResult(toOrigDim(enc, d));
- if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d)))
+ if (!findAffine(merger, tensor, d, a, getDimLevelType(enc, d), filterLdx))
return false; // inadmissible affine expression
}
}
+ assert(filterLdx == merger.getNumLoops());
return annotated;
}
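To make the analysis concrete, here is a hypothetical walk-through (not taken from the patch's tests) for the running example (d0, d1, d2) => (d0 + d1, d2) with both levels compressed, i.e., 3 native loops plus 1 filter loop:

  // Level 0 (subscript d0 + d1): the expression is compound and the level is
  // sparse, so its level type is recorded under the filter loop index
  // filterLdx == merger.getFilterLoopStartingIdx() == 3; filterLdx becomes 4.
  // Level 1 (subscript d2): a plain AffineDimExpr, recorded under native loop
  // index 2.
  // Afterwards filterLdx == merger.getNumLoops() == 4, so the assert holds.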
@@ -214,34 +266,58 @@ static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
/// latest possible index.
static bool topSortOptimal(unsigned n,
ArrayRef<utils::IteratorType> iteratorTypes,
- std::vector<unsigned> &topSort,
+ const Merger &merger, std::vector<unsigned> &topSort,
std::vector<unsigned> &inDegree,
std::vector<std::vector<bool>> &adjM) {
- std::vector<unsigned> redIt; // reduce iterator with 0 degree
- std::vector<unsigned> parIt; // parallel iterator with 0 degree
+ std::vector<unsigned> redIt; // reduce iterator with 0 degree
+ std::vector<unsigned> parIt; // parallel iterator with 0 degree
+ std::vector<unsigned> filterIt; // filter loop with 0 degree
for (unsigned i = 0; i < n; i++) {
if (inDegree[i] == 0) {
- if (linalg::isReductionIterator(iteratorTypes[i]))
+ if (merger.isFilterLoop(i))
+ filterIt.push_back(i);
+ else if (linalg::isReductionIterator(iteratorTypes[i]))
redIt.push_back(i);
else
parIt.push_back(i);
}
}
- while (!redIt.empty() || !parIt.empty()) {
- // We always choose parallel iterator if there is any.
- auto &it = !parIt.empty() ? parIt : redIt;
+ while (!redIt.empty() || !parIt.empty() || !filterIt.empty()) {
+ // We always choose loops in the order filter loop -> parallel loop ->
+ // reduction loop, because
+ // 1. Putting a reduction loop early might make the loop sequence
+ //    inadmissible.
+ // 2. Filter loops should be placed as early as possible for better
+ //    performance, since only one (if any) iteration will carry the
+ //    computation. E.g.,
+ //      for (1 to N)
+ //        for (1 to M)
+ //          for (1 to K)
+ //            if (xxx)
+ //              O(X) computation  => O(NMK+NMX) time complexity
+ //
+ //    By putting the filter loop one level up, we get
+ //
+ //      for (1 to N)
+ //        for (1 to K)
+ //          if (xxx)
+ //            for (1 to M)
+ //              O(X) computation  => O(NK+NMX) time complexity
+ auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
auto src = it.back();
topSort.push_back(src);
it.pop_back();
// Update in-degree, and push 0-degree node into worklist.
- for (unsigned dst = 0; dst < n; dst++)
+ for (unsigned dst = 0; dst < n; dst++) {
if (adjM[src][dst] && --inDegree[dst] == 0) {
- if (linalg::isReductionIterator(iteratorTypes[dst]))
+ if (merger.isFilterLoop(dst))
+ filterIt.push_back(dst);
+ else if (linalg::isReductionIterator(iteratorTypes[dst]))
redIt.push_back(dst);
else
parIt.push_back(dst);
}
+ }
}
return topSort.size() == n;
}
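The worklist priority can be seen on a toy iteration graph. A self-contained sketch, with a hypothetical 4-loop kernel in which f3 is the filter loop depending on d0 and d1, and d2 is a reduction loop (the classification below stands in for merger.isFilterLoop() and linalg::isReductionIterator()):

  #include <cassert>
  #include <vector>

  int main() {
    const unsigned n = 4, f3 = 3, d2 = 2;
    // adjM[src][dst] == true means src must be scheduled before dst.
    std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
    adjM[0][f3] = adjM[1][f3] = true; // the filter loop follows d0 and d1
    std::vector<unsigned> inDegree(n, 0);
    for (unsigned s = 0; s < n; s++)
      for (unsigned d = 0; d < n; d++)
        inDegree[d] += adjM[s][d];

    std::vector<unsigned> filterIt, parIt, redIt, topSort;
    auto classify = [&](unsigned i) {
      if (i == f3)
        filterIt.push_back(i);
      else if (i == d2)
        redIt.push_back(i);
      else
        parIt.push_back(i);
    };
    for (unsigned i = 0; i < n; i++)
      if (inDegree[i] == 0)
        classify(i);

    while (!filterIt.empty() || !parIt.empty() || !redIt.empty()) {
      // Same priority as topSortOptimal: filter -> parallel -> reduction.
      auto &it = !filterIt.empty() ? filterIt : (!parIt.empty() ? parIt : redIt);
      unsigned src = it.back();
      it.pop_back();
      topSort.push_back(src);
      for (unsigned dst = 0; dst < n; dst++)
        if (adjM[src][dst] && --inDegree[dst] == 0)
          classify(dst);
    }
    // The filter loop is scheduled right after the loops it depends on and
    // before the reduction loop: d1, d0, f3, d2.
    assert((topSort == std::vector<unsigned>{1, 0, 3, 2}));
    return 0;
  }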
@@ -340,7 +416,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
OpOperand *skip = nullptr) {
// Set up an n x n from/to adjacency matrix of the iteration graph
// for the implicit loop indices i_0 .. i_n-1.
- unsigned n = op.getNumLoops();
+ unsigned n = merger.getNumLoops();
std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
std::vector<unsigned> inDegree(n, 0); // in-degree of each node.
auto iteratorTypes = op.getIteratorTypesArray();
@@ -352,7 +428,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
// Get map and encoding.
auto map = op.getMatchingIndexingMap(&t);
auto enc = getSparseTensorEncoding(t.get().getType());
- assert(map.getNumDims() == n);
+ assert(map.getNumDims() + getNumCompoundAffineOnSparseDims(op) == n);
// Skip dense tensor constraints when not requested.
if (!(mask & SortMask::kIncludeDense) && !enc)
continue;
@@ -364,6 +440,19 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
AffineExpr ta = map.getResult(toOrigDim(enc, d));
Optional<unsigned> tldx = merger.getLoopIdx(t.getOperandNumber(), d);
+ // Filter loops should be constructed after all the loops they depend on,
+ // i.e., d0 + d1 < filter_loop(d0 + d1).
+ if (tldx && merger.isFilterLoop(tldx.value())) {
+ assert(!ta.isa<AffineDimExpr>() &&
+ !isDenseDLT(getDimLevelType(enc, d)));
+ addAffineOrderings(adjM, inDegree, ta, AffineExpr(), llvm::None, tldx);
+ // Now that the ordering of the affine expression is captured by the
+ // filter loop idx, we only need to ensure the ordering against the
+ // filter loop. Thus, we reset the affine expression to nil here to
+ // mark it as resolved.
+ ta = AffineExpr();
+ }
+
if (d > 0) {
AffineExpr fa = map.getResult(toOrigDim(enc, d - 1));
Optional<unsigned> fldx =
@@ -377,6 +466,11 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
if (!(mask & SortMask::kIncludeDense))
tryLoosenAffineDenseConstraints(op, fldx, fa, tldx, ta);
+ // (d0 + d1) < (d2 + d3), or
+ // filter_loop_d-1 < (d2 + d3), or
+ // (d0 + d1) < filter_loop_d, or
+ // filter_loop_d-1 < filter_loop_d depending on whether fa/ta is reset
+ // above.
addAffineOrderings(adjM, inDegree, fa, ta, fldx, tldx);
}
}
@@ -402,7 +496,7 @@ static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
// Report failure for a cyclic iteration graph.
topSort.clear();
topSort.reserve(n);
- return topSortOptimal(n, iteratorTypes, topSort, inDegree, adjM);
+ return topSortOptimal(n, iteratorTypes, merger, topSort, inDegree, adjM);
}
/// Returns true if tensor materializes uninitialized into the computation.
@@ -430,9 +524,8 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
// An all-dense annotated "sparse" output tensor becomes a linearized random
// access 1-dim memref. Also admissible since insertions cannot occur.
bool allDense = true;
- auto iteratorTypes = op.getIteratorTypesArray();
- unsigned numLoops = iteratorTypes.size();
- for (unsigned i = 0; i < numLoops; i++)
+ unsigned numLoops = merger.getNumLoops(); // numNativeLoops + numFilterLoops
+ for (unsigned i = 0; i < merger.getNumLoops(); i++)
if (isCompressedDLT(merger.getDimLevelType(tensor, i)) ||
isSingletonDLT(merger.getDimLevelType(tensor, i))) {
allDense = false;
@@ -443,19 +536,31 @@ static bool isAdmissibleTensorExp(Merger &merger, linalg::GenericOp op,
}
if (allDense)
return true;
+
+ // TODO: support compound affine expression on sparse output.
+ if (getNumCompoundAffineOnSparseDims(op.getMatchingIndexingMap(lhs),
+ lhs->get()) != 0)
+ return false;
+
// A tensor expression with a sparse output tensor that changes its values
// but not its nonzero structure, an operation called "simply dynamic" in
// [Bik96,Ch9], is also admissible without special codegen.
if (merger.isSingleCondition(tensor, exp))
return true;
+
// Accept "truly dynamic" if the output tensor materializes uninitialized
// into the computation and insertions occur in lexicographic index order.
if (isMaterializing(lhs->get())) {
+ auto iteratorTypes = op.getIteratorTypesArray();
unsigned nest = 0;
for (unsigned i = 0; i < numLoops; i++) {
- if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
- break; // terminate at first reduction
- nest++;
+ if (!merger.isFilterLoop(topSort[i])) {
+ // We only count non-filter loops, as filter loops should be considered
+ // a special type of parallel loop.
+ if (linalg::isReductionIterator(iteratorTypes[topSort[i]]))
+ break; // terminate at first reduction
+ nest++;
+ }
}
// Determine admissible dynamic insertion situations:
// (1) fully injective, since there are no reductions,
@@ -878,7 +983,14 @@ static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
auto enc = getSparseTensorEncoding(t.get().getType());
for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
AffineExpr a = map.getResult(toOrigDim(enc, d));
- if (!isInvariantAffine(codegen, a, ldx, atLevel))
+ Optional<unsigned> sldx = merger.getLoopIdx(t.getOperandNumber(), d);
+ if (sldx && merger.isFilterLoop(sldx.value())) {
+ if (!codegen.getLoopIdxValue(sldx.value()))
+ // The filter loop has not been constructed yet.
+ return;
+ if (sldx.value() == ldx)
+ atLevel = true;
+ } else if (!isInvariantAffine(codegen, a, ldx, atLevel))
return; // still in play
}
// All exhausted at this level (atLevel denotes exactly at this level).
@@ -1003,6 +1115,16 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
Operation *loop =
genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+ if (merger.isFilterLoop(idx)) {
+ assert(isSparse);
+ OpOperand *t = &op->getOpOperand(tid);
+ auto enc = getSparseTensorEncoding(t->get().getType());
+ // Retrieves the affine expression for the filter loop.
+ AffineExpr a =
+ op.getMatchingIndexingMap(t).getResult(toOrigDim(enc, dim));
+ return codegen.loopEmitter.enterFilterLoopOverTensorAtDim(
+ builder, loc, tid, dim, a, reduc);
+ }
return codegen.loopEmitter.enterLoopOverTensorAtDim(
builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
}).value();
@@ -1488,7 +1610,8 @@ struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
return failure();
unsigned numTensors = op->getNumOperands();
unsigned numLoops = op.getNumLoops();
- Merger merger(numTensors, numLoops);
+ unsigned numFilterLoops = getNumCompoundAffineOnSparseDims(op);
+ Merger merger(numTensors, numLoops, numFilterLoops);
if (!findSparseAnnotations(merger, op))
return failure();
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index d2748460a094a..929900142d272 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -127,7 +127,7 @@ class MergerTestBase : public ::testing::Test {
protected:
MergerTestBase(unsigned numTensors, unsigned numLoops)
: numTensors(numTensors), numLoops(numLoops),
- merger(numTensors, numLoops) {}
+ merger(numTensors, numLoops, /*numFilterLoops=*/0) {}
///
/// Expression construction helpers.