[Mlir-commits] [mlir] 07bd5f2 - [mlir][sparse] Support strided convolution on compressed level.
Peiming Liu
llvmlistbot at llvm.org
Wed Aug 30 12:38:01 PDT 2023
Author: Peiming Liu
Date: 2023-08-30T19:37:50Z
New Revision: 07bd5f20bcd36d3a906820b27fde41f7cf0420ef
URL: https://github.com/llvm/llvm-project/commit/07bd5f20bcd36d3a906820b27fde41f7cf0420ef
DIFF: https://github.com/llvm/llvm-project/commit/07bd5f20bcd36d3a906820b27fde41f7cf0420ef.diff
LOG: [mlir][sparse] Support strided convolution on compressed level.
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D158912
Added:
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir
Modified:
mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
index 441f29dedcdafb..7739d51ed40bf3 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -17,6 +17,7 @@
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
using namespace mlir;
using namespace mlir::sparse_tensor;
@@ -35,6 +36,8 @@ using namespace mlir::sparse_tensor;
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
+#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
+#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))
//===----------------------------------------------------------------------===//
@@ -117,8 +120,8 @@ static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
Level lvl) {
// sliceCrd = (tensorCrd - offset) / stride
crd = SUBI(crd, offset);
- Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
- crd = builder.create<arith::DivUIOp>(loc, crd, stride);
+ Value rem = REMUI(crd, stride);
+ crd = DIVUI(crd, stride);
return std::make_pair(crd, rem);
}
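For reference, the translation above follows sliceCrd = (tensorCrd - offset) / stride, with the remainder telling whether a tensor coordinate actually lies on the strided slice. A minimal standalone C++ sketch (toSliceCoordinate and the numeric values are illustrative, not part of this patch):

#include <cassert>
#include <cstdint>
#include <utility>

// Mirrors the formula in fromSliceCrd above:
//   sliceCrd = (tensorCrd - offset) / stride
// The remainder decides whether tensorCrd lies on the strided slice.
static std::pair<uint64_t, uint64_t>
toSliceCoordinate(uint64_t tensorCrd, uint64_t offset, uint64_t stride) {
  uint64_t diff = tensorCrd - offset;
  return {diff / stride, diff % stride};
}

int main() {
  // With offset 3 and stride 2, tensor coordinate 7 is on the slice
  // (remainder 0, slice coordinate 2) ...
  auto [crdOn, remOn] = toSliceCoordinate(7, 3, 2);
  assert(crdOn == 2 && remOn == 0);
  // ... while tensor coordinate 6 falls between strides (remainder 1).
  auto [crdOff, remOff] = toSliceCoordinate(6, 3, 2);
  assert(crdOff == 1 && remOff == 1);
  return 0;
}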
@@ -725,6 +728,7 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
}
case LoopCondKind::SparseAffineCond: {
assert(ivs.size() == 1);
+
Value crdHi; // loop upper bound
{
OpBuilder::InsertionGuard guard(builder);
@@ -732,9 +736,9 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
// crdHi is a loop invariant; hoist the computation outside the loop.
if (llvm::isa_and_nonnull<scf::WhileOp>(loop))
builder.setInsertionPoint(loop);
- auto [size, stride] = sliceMeta[tid][lvl].back();
+ auto [remSz, stride] = sliceMeta[tid][lvl].back();
assert(stride == 1 && "Not yet implemented");
- crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, size);
+ crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz);
}
assert(crdHi);
return genSparseReducedAffineCond(builder, loc,
@@ -792,18 +796,33 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
return std::nullopt;
}
case LoopCondKind::SparseAffineUnRedCond: {
+ unsigned depth = sliceStack[tid].back().depth;
+ unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
assert(ivs.size() == 3);
- // Coord is the relative offset related to its parents.
- // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
- assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
+
// Updates the current slice info
SliceInfo &sliceInfo = sliceStack[tid].back();
sliceInfo.isNonEmpty = ivs[0];
sliceInfo.minCrd = ivs[1];
sliceInfo.offset = ivs[2];
- coords[tid][lvl] = sliceInfo.offset;
+
+ // Crd (the value we use to coiterate) is the offset relative to its
+ // parents; we can use the absolute offset here because when depth == 1,
+ // absOffset[lvl][depth - 1] always equals zero.
+ // TODO: update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
+ assert(depth == 1 && "TODO: not yet implement");
+ Value crd = sliceInfo.offset;
+
+ Value onStride = constantI1(builder, loc, true);
+ if (curStride != 1) {
+ Value strideVal = C_IDX(curStride);
+ Value rem = REMUI(crd, strideVal);
+ crd = DIVUI(crd, strideVal);
+ onStride = CMPI(eq, rem, C_IDX(0));
+ }
+ coords[tid][lvl] = crd;
// No extra check is needed before accessing the tensor level.
- return std::nullopt;
+ return onStride;
}
default:
llvm_unreachable("Unhandled LoopCondKind");
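The onStride predicate introduced above only admits offsets that are multiples of the stride; everything else is filtered out before the loop body runs. A small illustrative C++ sketch of that filtering (the offsets and stride below are made up, not taken from the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical absolute slice offsets reached while co-iterating the
  // compressed level; only offsets that are multiples of the stride map to a
  // loop coordinate, the rest are rejected by the onStride predicate.
  const uint64_t stride = 2;
  for (uint64_t offset = 0; offset < 6; ++offset) {
    bool onStride = (offset % stride) == 0;
    if (!onStride)
      continue; // skipped by the extra check emitted for this condition
    uint64_t crd = offset / stride;
    std::printf("offset %llu -> coordinate %llu\n",
                (unsigned long long)offset, (unsigned long long)crd);
  }
  return 0;
}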
@@ -814,11 +833,44 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc,
Value pred, ValueRange curArgs,
TensorLvlCond cond) {
- // Currently only sparse slice condition need extra check.
- assert(isSliceCond(cond.second) && isSparseCond(cond.second));
- assert(curArgs.size() == 1);
- Value nextPos = ADDI(curArgs.front(), C_IDX(1));
- return SELECT(pred, curArgs.front(), nextPos)->getResults();
+ assert(isSparseCond(cond.second));
+ auto [tid, lvl] = unpackTensorLevel(cond.first);
+ if (isAffineIdxUnRedCond(cond.second)) {
+ unsigned depth = sliceStack[tid].back().depth;
+ unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
+ if (curStride == 1)
+ return curArgs;
+ // Build
+ // if (onStride) {
+ // yield curSlice
+ // } else {
+ // yield nxSlice.
+ //}
+ assert(curArgs.size() == 3);
+ auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true);
+ {
+ OpBuilder::InsertionGuard guard(builder);
+ // If the current offset is on stride, yield the current slice unchanged.
+ builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+ YIELD(curArgs);
+ // Otherwise, advance to the next slice and yield its induction values.
+ builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+ auto [nonEmpty, minCrd, offset] =
+ genSliceNextInduction(builder, loc, tid, lvl);
+ SmallVector<Value> nxSlice{nonEmpty, minCrd, offset};
+ YIELD(nxSlice);
+ }
+ // Continue the user-generated loop body with the checked slice values.
+ return ifOp.getResults();
+ } else {
+ // Currently only the sparse slice condition needs an extra check.
+ assert(isSliceCond(cond.second) && isSparseCond(cond.second));
+ assert(curArgs.size() == 1);
+ Value nextPos = ADDI(curArgs.front(), C_IDX(1));
+ return SELECT(pred, curArgs.front(), nextPos)->getResults();
+ }
+ llvm_unreachable("unhandled case");
}
std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
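The extra check built in genCheckedValue has the same shape as the scf.if sketched in its comment: with a unit stride the loop arguments pass through untouched, otherwise an off-stride offset is replaced by the next slice induction. A minimal C++ sketch under those assumptions (SliceArgs, nextSliceInduction, and checkedValue are hypothetical stand-ins, not the emitter's API):

#include <cstdint>
#include <tuple>

// The three loop-carried values of an unreduced affine slice condition:
// (isNonEmpty, minCrd, offset), matching the `curArgs.size() == 3` assert.
using SliceArgs = std::tuple<bool, uint64_t, uint64_t>;

// Hypothetical stand-in for genSliceNextInduction; the real emitter derives
// the next induction values from the tensor's positions and coordinates.
static SliceArgs nextSliceInduction(const SliceArgs &cur) {
  auto [nonEmpty, minCrd, offset] = cur;
  return {nonEmpty, minCrd, offset + 1};
}

// Same shape as the emitted check: with a unit stride the loop arguments pass
// through untouched; otherwise an off-stride offset is replaced by the next
// slice induction before the loop body runs.
static SliceArgs checkedValue(unsigned stride, bool onStride,
                              const SliceArgs &cur) {
  if (stride == 1)
    return cur;
  return onStride ? cur : nextSliceInduction(cur);
}

int main() {
  SliceArgs cur{true, 0, 3};
  // Stride 2, offset 3 is off stride: the checked value advances to offset 4.
  SliceArgs sel = checkedValue(2, /*onStride=*/false, cur);
  return std::get<2>(sel) == 4 ? 0 : 1;
}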
@@ -1877,9 +1929,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
TensorId tid, Level lvl) {
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
unsigned depth = levelReducedDep[tid][lvl];
- // TODO: handle case when the current slice stride is not one.
- assert(sliceMeta[tid][lvl][depth].second == 1 && "Not yet implemented");
-
// The remaining slice size after reduction.
Value remSz = sliceMeta[tid][lvl][depth + 1].first;
// Dense slice begin is trivial
@@ -2251,8 +2300,6 @@ LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
// FIXME: compute relative offset.
assert(info.depth - 1 == 0);
- Value nextRelOffset = nextAbsOffset;
- nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);
return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset);
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
index d9948d3f4db73b..c6518decbdee0a 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -393,7 +393,7 @@ class LoopEmitter {
}
static bool isTrivalIdxCond(LoopCondKind k) { return !isAffineIdxCond(k); }
- /// Whether the affine index expression is not fully reduced.
+ /// Whether the affine index expression is fully reduced.
static bool isAffineIdxUnRedCond(LoopCondKind k) {
return isAffineIdxCond(k) && static_cast<uint8_t>(k) & kAffineIdxCondUnRed;
}
@@ -405,7 +405,7 @@ class LoopEmitter {
// E.g., to iterate over a sparse tensor slice, we need to check whether the
// current coordinate is on the slice (e.g., due to stride) or not.
static bool isCondWithExtraCheck(LoopCondKind k) {
- return isSparseCond(k) && isSliceCond(k);
+ return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
}
static LoopCondKind makeLoopCondKind(bool isSparse, bool isSlice,
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir
new file mode 100644
index 00000000000000..77d9f8499318ab
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir
@@ -0,0 +1,102 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
+// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#CCCC = #sparse_tensor.encoding<{
+ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ]
+}>
+
+ // Creates and returns a 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+ %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+ %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+ %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+ strides = dense<2> : tensor<2xi64>}
+ ins (%arg0, %arg1: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+ outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+ %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+ strides = dense<2> : tensor<2xi64>}
+ ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
+ outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @entry() {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c3 = arith.constant 3 : index
+ %c6 = arith.constant 6 : index
+ %c9 = arith.constant 9 : index
+ %f10 = arith.constant 10.00000e+00 : f32
+ %val = arith.constant 2.00000e+00 : f32
+ %zero = arith.constant 0.00000e+00 : f32
+
+ %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c9, %c9, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c3, %c3, %c0] : tensor<?x?x?x?xf32>
+ %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+ %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
+ : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
+
+ %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+ %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+ // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+ // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+ // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+ %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+ : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+ vector.print %dense_v : vector<3x3x3x1xf32>
+
+ // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+ // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+ // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+ %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
+ : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+ vector.print %v1 : vector<3x3x3x1xf32>
+
+ // Free the resources
+ bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>
+
+ bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
+ return
+}
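The FileCheck expectations above follow from a short hand computation: the input holds a single 10.0 at (0, 3, 3, 0), the 3x3x3x1 filter is all 2.0, and with stride 2 only output position (0, 1, 1, 0) has a receptive field covering that entry, via kernel tap (1, 1). A minimal sketch of the arithmetic:

#include <cstdio>

int main() {
  // Only one input entry is nonzero, so only one filter tap contributes:
  // oh*2 + kh == 3 and ow*2 + kw == 3 forces (oh, ow) = (1, 1), (kh, kw) = (1, 1).
  float expected = 10.0f * 2.0f; // == 20, as checked at output[0][1][1][0]
  std::printf("expected output value: %g\n", expected);
  return 0;
}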