[Mlir-commits] [mlir] 0b55f94 - [mlir][sparse] replace stack-based access pattern with dyn-alloc
Aart Bik
llvmlistbot at llvm.org
Wed Apr 6 17:10:58 PDT 2022
Author: Aart Bik
Date: 2022-04-06T17:10:43-07:00
New Revision: 0b55f94d2bf3b0de027e6e674c99210d20bd8f7b
URL: https://github.com/llvm/llvm-project/commit/0b55f94d2bf3b0de027e6e674c99210d20bd8f7b
DIFF: https://github.com/llvm/llvm-project/commit/0b55f94d2bf3b0de027e6e674c99210d20bd8f7b.diff
LOG: [mlir][sparse] replace stack-based access pattern with dyn-alloc
Rationale:
Allocating the temporary buffers for access pattern expansion on the stack
(using alloca) is a bit too aggressive, since it easily runs out of stack space
for large enveloping tensor dimensions. This revision switches to dynamic
allocation of these buffers, using explicit alloc/dealloc pairs.
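As an illustration, the lowering now takes roughly the following shape (a minimal
sketch with hypothetical size and constants; the exact IR is checked in the updated
tests below), with the buffers allocated before and explicitly deallocated after
the enclosing loop nest:

    %sz = arith.constant 8 : index        // expanded dimension size (hypothetical)
    %f0 = arith.constant 0.0 : f64
    %false = arith.constant false
    // Heap-allocated buffers of dynamic size instead of stack alloca.
    %values  = memref.alloc(%sz) : memref<?xf64>
    %filled  = memref.alloc(%sz) : memref<?xi1>
    %indices = memref.alloc(%sz) : memref<?xindex>
    linalg.fill ins(%f0 : f64) outs(%values : memref<?xf64>)
    linalg.fill ins(%false : i1) outs(%filled : memref<?xi1>)
    // ... loop nest performing the access pattern expansion and compression ...
    memref.dealloc %values  : memref<?xf64>
    memref.dealloc %filled  : memref<?xi1>
    memref.dealloc %indices : memref<?xindex>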
Reviewed By: bixia, wrengr
Differential Revision: https://reviews.llvm.org/D123253
Added:
mlir/test/Dialect/SparseTensor/sparse_expand.mlir
Modified:
mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
mlir/test/Dialect/SparseTensor/conversion.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index d78061e0d7587..164a7fe0a49a9 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -160,6 +160,16 @@ static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
}
+/// Generates an uninitialized buffer of the given size and type,
+/// but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`). Unlike temporary buffers on the stack,
+/// this buffer must be explicitly deallocated by the client.
+static Value genAlloc(ConversionPatternRewriter &rewriter, Location loc,
+ Value sz, Type tp) {
+ auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
+ return rewriter.create<memref::AllocOp>(loc, memTp, ValueRange{sz});
+}
+
/// Generates an uninitialized temporary buffer of the given size and
/// type, but returns it as type `memref<? x $tp>` (rather than as type
/// `memref<$sz x $tp>`).
@@ -761,15 +771,18 @@ class SparseTensorExpandConverter : public OpConversionPattern<ExpandOp> {
auto enc = getSparseTensorEncoding(srcType);
Value src = adaptor.getOperands()[0];
Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
- // Allocate temporary stack buffers for values, filled-switch, and indices.
- Value values = genAlloca(rewriter, loc, sz, eltType);
- Value filled = genAlloca(rewriter, loc, sz, boolType);
- Value indices = genAlloca(rewriter, loc, sz, idxType);
+ // Allocate temporary buffers for values, filled-switch, and indices.
+ // We do not use stack buffers for this, since the expanded size may
+ // be rather large (as it envelops a single expanded dense dimension).
+ Value values = genAlloc(rewriter, loc, sz, eltType);
+ Value filled = genAlloc(rewriter, loc, sz, boolType);
+ Value indices = genAlloc(rewriter, loc, sz, idxType);
Value zero = constantZero(rewriter, loc, idxType);
// Reset the values/filled-switch to all-zero/false. Note that this
// introduces an O(N) operation into the computation, but this reset
// operation is amortized over the innermost loops for the access
- // pattern expansion.
+ // pattern expansion. As noted in the operation doc, we would like
+ // to amortize this setup cost even between kernels.
rewriter.create<linalg::FillOp>(
loc, ValueRange{constantZero(rewriter, loc, eltType)},
ValueRange{values});
@@ -789,6 +802,7 @@ class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
LogicalResult
matchAndRewrite(CompressOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ Location loc = op->getLoc();
// Note that this method call resets the values/filled-switch back to
// all-zero/false by only iterating over the set elements, so the
// complexity remains proportional to the sparsity of the expanded
@@ -798,6 +812,18 @@ class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
TypeRange noTp;
replaceOpWithFuncCall(rewriter, op, name, noTp, adaptor.getOperands(),
EmitCInterface::On);
+ // Deallocate the buffers on exit of the loop nest.
+ Operation *parent = op;
+ for (; isa<scf::ForOp>(parent->getParentOp()) ||
+ isa<scf::WhileOp>(parent->getParentOp()) ||
+ isa<scf::ParallelOp>(parent->getParentOp()) ||
+ isa<scf::IfOp>(parent->getParentOp());
+ parent = parent->getParentOp())
+ ;
+ rewriter.setInsertionPointAfter(parent);
+ rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[2]);
+ rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[3]);
+ rewriter.create<memref::DeallocOp>(loc, adaptor.getOperands()[4]);
return success();
}
};
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
index a3078e4913fca..00451c61f293f 100644
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -461,24 +461,31 @@ func @sparse_insert(%arg0: tensor<128xf32, #SparseVector>,
}
// CHECK-LABEL: func @sparse_expansion()
-// %[[S:.*]] = call @sparseDimSize
-// %[[V:.*]] = memref.alloca(%[[S]]) : memref<?xf64>
-// %[[F:.*]] = memref.alloca(%[[S]]) : memref<?xi1>
-// %[[A:.*]] = memref.alloca(%[[S]]) : memref<?xindex>
-// linalg.fill ins(%{{.*}} : f64) outs(%[[V]] : memref<?xf64>)
-// linalg.fill ins(%{{.*}} : i1) outs(%[[F]] : memref<?xi1>)
-// CHECK: return
-func @sparse_expansion() {
+// CHECK: %[[S:.*]] = call @sparseDimSize
+// CHECK: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-DAG: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK: return %[[C]] : memref<?xindex>
+func @sparse_expansion() -> memref<?xindex> {
%c = arith.constant 8 : index
%0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix>
%values, %filled, %added, %count = sparse_tensor.expand %0
: tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
- return
+ return %added : memref<?xindex>
}
// CHECK-LABEL: func @sparse_compression(
-// CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[A:.*0]]: !llvm.ptr<i8>,
+// CHECK-SAME: %[[B:.*1]]: memref<?xindex>,
+// CHECK-SAME: %[[C:.*2]]: memref<?xf64>,
+// CHECK-SAME: %[[D:.*3]]: memref<?xi1>,
+// CHECK-SAME: %[[E:.*4]]: memref<?xindex>,
// CHECK: call @expInsertF64(%[[A]],
+// CHECK-DAG: memref.dealloc %[[C]] : memref<?xf64>
+// CHECK-DAG: memref.dealloc %[[D]] : memref<?xi1>
+// CHECK-DAG: memref.dealloc %[[E]] : memref<?xindex>
// CHECK: return
func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
%arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,
diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
new file mode 100644
index 0000000000000..63e7ab83d6344
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt %s -sparsification | \
+// RUN: FileCheck %s --check-prefix=CHECK-SPARSE
+// RUN: mlir-opt %s -sparsification -sparse-tensor-conversion | \
+// RUN: FileCheck %s --check-prefix=CHECK-CONVERT
+
+#DCSC = #sparse_tensor.encoding<{
+ dimLevelType = [ "compressed", "compressed" ],
+ dimOrdering = affine_map<(i,j) -> (j,i)>
+}>
+
+#SV = #sparse_tensor.encoding<{
+ dimLevelType = [ "compressed" ]
+}>
+
+#rowsum = {
+ indexing_maps = [
+ affine_map<(i,j) -> (i,j)>, // A
+ affine_map<(i,j) -> (i)> // x (out)
+ ],
+ iterator_types = ["parallel", "reduction"],
+ doc = "X(i) = SUM A(i,j)"
+}
+
+//
+// CHECK-SPARSE-LABEL: func @kernel(
+// CHECK-SPARSE: %[[A:.*]], %[[B:.*]], %[[C:.*]], %{{.*}} = sparse_tensor.expand
+// CHECK-SPARSE: scf.for
+// CHECK-SPARSE: scf.for
+// CHECK-SPARSE: sparse_tensor.compress %{{.*}}, %{{.*}}, %[[A]], %[[B]], %[[C]]
+// CHECK-SPARSE: %[[RET:.*]] = sparse_tensor.load %{{.*}} hasInserts
+// CHECK-SPARSE: return %[[RET]]
+//
+// CHECK-CONVERT-LABEL: func @kernel(
+// CHECK-CONVERT: %{{.*}} = call @sparseDimSize
+// CHECK-CONVERT: %[[S:.*]] = call @sparseDimSize
+// CHECK-CONVERT: %[[A:.*]] = memref.alloc(%[[S]]) : memref<?xf64>
+// CHECK-CONVERT: %[[B:.*]] = memref.alloc(%[[S]]) : memref<?xi1>
+// CHECK-CONVERT: %[[C:.*]] = memref.alloc(%[[S]]) : memref<?xindex>
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : f64) outs(%[[A]] : memref<?xf64>)
+// CHECK-CONVERT: linalg.fill ins(%{{.*}} : i1) outs(%[[B]] : memref<?xi1>)
+// CHECK-CONVERT: scf.for
+// CHECK-CONVERT: scf.for
+// CHECK-CONVERT: call @expInsertF64
+// CHECK-CONVERT: memref.dealloc %[[A]] : memref<?xf64>
+// CHECK-CONVERT: memref.dealloc %[[B]] : memref<?xi1>
+// CHECK-CONVERT: memref.dealloc %[[C]] : memref<?xindex>
+// CHECK-CONVERT: call @endInsert
+//
+func @kernel(%arga: tensor<?x?xf64, #DCSC>) -> tensor<?xf64, #SV> {
+ %c0 = arith.constant 0 : index
+ %n = tensor.dim %arga, %c0 : tensor<?x?xf64, #DCSC>
+ %v = sparse_tensor.init [%n] : tensor<?xf64, #SV>
+ %0 = linalg.generic #rowsum
+ ins(%arga: tensor<?x?xf64, #DCSC>)
+ outs(%v: tensor<?xf64, #SV>) {
+ ^bb(%a: f64, %x: f64):
+ %1 = arith.addf %x, %a : f64
+ linalg.yield %1 : f64
+ } -> tensor<?xf64, #SV>
+ return %0 : tensor<?xf64, #SV>
+}