[Mlir-commits] [mlir] db7f639 - [mlir][sparse] fix a crash when generating sparse convolution with nchw input
Peiming Liu
llvmlistbot at llvm.org
Tue May 30 13:16:59 PDT 2023
Author: Peiming Liu
Date: 2023-05-30T20:16:54Z
New Revision: db7f639b900dca266ea9f47c934418af0a67122b
URL: https://github.com/llvm/llvm-project/commit/db7f639b900dca266ea9f47c934418af0a67122b
DIFF: https://github.com/llvm/llvm-project/commit/db7f639b900dca266ea9f47c934418af0a67122b.diff
LOG: [mlir][sparse] fix a crash when generating sparse convolution with nchw input
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D151744
Added:
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
Modified:
mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 9c2465d25737d..1b711992a30d5 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -515,6 +515,15 @@ static bool topSortOptimal(CodegenEnv &env,
return env.topSortSize() == numLoops;
}
+static void addIterOrdering(LoopId f, LoopId t,
+ std::vector<std::vector<bool>> &adjM,
+ std::vector<unsigned> &inDegree) {
+ if (!adjM[f][t] && f != t) {
+ adjM[f][t] = true;
+ inDegree[t]++;
+ }
+}
+
/// Helper method to add all constraints from the indices in one affine
/// expression before all indices in the other affine expression. For
/// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
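To make the refactor concrete, here is a minimal standalone sketch of the new helper with plain STL types; the helper body matches the patch, while the driver is illustrative only and replays the documented example i0+i1 < i2+i3+1, which yields the four edges i0<i2, i0<i3, i1<i2, i1<i3:

#include <cstdio>
#include <vector>

using LoopId = unsigned;

// Insert edge f -> t exactly once, skip self-loops, and keep the
// in-degree count consistent with the adjacency matrix.
static void addIterOrdering(LoopId f, LoopId t,
                            std::vector<std::vector<bool>> &adjM,
                            std::vector<unsigned> &inDegree) {
  if (!adjM[f][t] && f != t) {
    adjM[f][t] = true;
    inDegree[t]++;
  }
}

int main() {
  const unsigned numLoops = 4;
  std::vector<std::vector<bool>> adjM(numLoops,
                                      std::vector<bool>(numLoops, false));
  std::vector<unsigned> inDegree(numLoops, 0);

  // All-pairs constraints for i0+i1 < i2+i3+1.
  for (LoopId f : {0u, 1u})
    for (LoopId t : {2u, 3u})
      addIterOrdering(f, t, adjM, inDegree);
  addIterOrdering(0, 2, adjM, inDegree); // duplicate edge: a no-op

  for (LoopId t = 0; t < numLoops; t++)
    std::printf("inDegree[i%u] = %u\n", t, inDegree[t]); // prints 0 0 2 2
  return 0;
}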
@@ -533,10 +542,7 @@ static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
// Recursion leaf.
assert(fidx && tidx);
const LoopId f = *fidx, t = *tidx;
- if (!adjM[f][t]) {
- adjM[f][t] = true;
- inDegree[t]++;
- }
+ addIterOrdering(f, t, adjM, inDegree);
return;
}
// Picks an affine expression and expands (recurses into) it.
@@ -693,6 +699,18 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
const AffineExpr fa = map.getResult(toOrigDim(enc, lvl - 1));
const AffineExpr ta = map.getResult(toOrigDim(enc, lvl));
+ if (auto fdim = fa.dyn_cast<AffineDimExpr>()) {
+ AffineDimCollector tCollector;
+ tCollector.walkPostOrder(ta);
+
+ const LoopId f = env.makeLoopId(fdim.getPosition());
+ for (auto td : tCollector.dims) {
+ const LoopId t = env.makeLoopId(td.getPosition());
+ addIterOrdering(f, t, adjM, inDegree);
+ }
+ continue;
+ }
+
// This is a heuristic: we pick arbitrary reduction loops from lhs and
// rhs and use them as d_x and d_y.
finder.walkPostOrder(fa);
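Why this new branch fires for the test below: with unit strides and dilations, the input indexing map of linalg.conv_2d_nchw_fchw should be (d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2 + d5, d3 + d6), so the adjacent level pair (fa, ta) = (d4, d2 + d5) has a plain AffineDimExpr on the left: d4 is the reduced channel loop 'c' named in the test's TODO. A sketch of the early path on that pair, reusing the addIterOrdering sketch above (loop indices here are illustrative):

// fa = d4 (plain dim, the 'c' reduction loop), ta = d2 + d5.
void orderPlainDimLevel(std::vector<std::vector<bool>> &adjM,
                        std::vector<unsigned> &inDegree) {
  const LoopId f = 4;        // position of fa
  for (LoopId t : {2u, 5u})  // dims collected from ta by AffineDimCollector
    addIterOrdering(f, t, adjM, inDegree);
  // ... then `continue`: the d_x/d_y reduction-loop heuristic below is
  // skipped for this pair, which appears to be the previously crashing path.
}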
@@ -704,10 +722,7 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
const LoopId tldx = env.makeLoopId(texp.getPosition());
// d_x > d_y
- if (!adjM[fldx][tldx]) {
- adjM[fldx][tldx] = true;
- inDegree[tldx]++;
- }
+ addIterOrdering(fldx, tldx, adjM, inDegree);
AffineDimCollector fCollector;
fCollector.walkPostOrder(fa);
@@ -717,21 +732,11 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
// make sure dx and dy come last;
for (auto fd : fCollector.dims) {
const LoopId f = env.makeLoopId(fd.getPosition());
- if (f == fldx)
- continue;
- if (!adjM[f][fldx]) {
- adjM[f][fldx] = true;
- inDegree[fldx]++;
- }
+ addIterOrdering(f, fldx, adjM, inDegree);
}
for (auto td : tCollector.dims) {
const LoopId t = env.makeLoopId(td.getPosition());
- if (t == tldx)
- continue;
- if (!adjM[t][tldx]) {
- adjM[t][tldx] = true;
- inDegree[tldx]++;
- }
+ addIterOrdering(t, tldx, adjM, inDegree);
}
// Since we only support affine addition, the order between two dim
// expressions does not really matter.
@@ -746,15 +751,11 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t,
const LoopId f = env.makeLoopId(fd.getPosition());
if (f == fldx) // skip d_x
continue;
-
for (auto td : tCollector.dims) {
const LoopId t = env.makeLoopId(td.getPosition());
if (t == tldx) // skip d_y
continue;
- if (!adjM[f][t]) {
- adjM[f][t] = true;
- inDegree[t]++;
- }
+ addIterOrdering(f, t, adjM, inDegree);
}
}
}
@@ -797,8 +798,7 @@ static bool computeIterationGraph(CodegenEnv &env, SortMask mask,
isSingletonDLT(dltI)) {
for (LoopId j = 0; j < numLoops; j++)
if (isUndefDLT(env.dlt(tid, j))) {
- adjM[i][j] = true;
- inDegree[j]++;
+ addIterOrdering(i, j, adjM, inDegree);
}
} else {
assert(isDenseDLT(dltI) || isUndefDLT(dltI));
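For completeness, the adjM/inDegree pair built above is what topSortOptimal consumes as a topological sort over loop indices, succeeding exactly when env.topSortSize() == numLoops (first hunk). Below is a Kahn-style skeleton of that consumption; the in-tree version layers heuristics on which ready loop to emit first, so treat this as an assumption about the overall shape only:

#include <queue>
#include <vector>

using LoopId = unsigned;

std::vector<LoopId> topoSort(const std::vector<std::vector<bool>> &adjM,
                             std::vector<unsigned> inDegree) {
  const unsigned n = static_cast<unsigned>(inDegree.size());
  std::vector<LoopId> order;
  std::queue<LoopId> ready;
  for (LoopId i = 0; i < n; i++)
    if (inDegree[i] == 0)
      ready.push(i); // loops with no pending "must come after" constraints
  while (!ready.empty()) {
    LoopId i = ready.front();
    ready.pop();
    order.push_back(i);
    for (LoopId j = 0; j < n; j++)
      if (adjM[i][j] && --inDegree[j] == 0)
        ready.push(j);
  }
  // order.size() < n means the constraints are cyclic: no legal loop order.
  return order;
}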
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
new file mode 100644
index 0000000000000..1d71990e55b32
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
@@ -0,0 +1,178 @@
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: -e entry -entry-point-result=void \
+// DEFINE: -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+
+// Do the same run, but now with direct IR generation and, if available, VLA
+// vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
+// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \
+// REDEFINE: --entry-function=entry_lli \
+// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \
+// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \
+// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
+// REDEFINE: FileCheck %s
+// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run}
+
+
+// TODO: we can only support dense output for nchw input because 'c' is a reduction loop
+
+
+#CCCD = #sparse_tensor.encoding<{
+ lvlTypes = [ "dense", "dense", "dense", "compressed" ]
+}>
+
+
+#CCCC = #sparse_tensor.encoding<{
+ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ]
+}>
+
+// FIXME: CDCD encoding crashes!
+
+// Creates and returns a 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f.
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+ %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+ %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nchw_fchw(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+ %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>,
+ strides = dense<1> : tensor<2xi64>}
+ ins (%arg0, %arg1: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+ outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nchw_fchw_CCCD(%arg0: tensor<?x?x?x?xf32, #CCCD>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+ %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>,
+ strides = dense<1> : tensor<2xi64>}
+ ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCD>, tensor<?x?x?x?xf32>)
+ outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nchw_fchw_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+ %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>,
+ strides = dense<1> : tensor<2xi64>}
+ ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
+ outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @entry() {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c3 = arith.constant 3 : index
+ %c6 = arith.constant 6 : index
+ %c8 = arith.constant 8 : index
+ %f10 = arith.constant 10.00000e+00 : f32
+ %val = arith.constant 2.00000e+00 : f32
+ %zero = arith.constant 0.00000e+00 : f32
+
+ %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c3, %c8, %c8, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c0, %c3] : tensor<?x?x?x?xf32>
+ %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %out2D_nhwc_CCCD = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+ %out2D_nhwc_CCCC = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+ %in2D_nhwc_CCCD = sparse_tensor.convert %in2D_nhwc
+ : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCD>
+ %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
+ : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
+
+ %dense_ret = call @conv_2d_nchw_fchw(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+ %CCCD_ret = call @conv_2d_nchw_fchw_CCCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor<?x?x?x?xf32, #CCCD>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+ %CCCC_ret = call @conv_2d_nchw_fchw_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc_CCCC) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+
+ // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) )
+ %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+ : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+ vector.print %dense_v : vector<3x1x6x6xf32>
+
+ // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) )
+ %v1 = vector.transfer_read %CCCD_ret[%c0, %c0, %c0, %c0], %zero
+ : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+ vector.print %v1 : vector<3x1x6x6xf32>
+
+ // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ),
+ // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ),
+ // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) )
+ %v2 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
+ : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+ vector.print %v2 : vector<3x1x6x6xf32>
+
+ // Free the resources
+ bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %out2D_nhwc_CCCD : tensor<?x?x?x?xf32>
+ bufferization.dealloc_tensor %out2D_nhwc_CCCC : tensor<?x?x?x?xf32>
+
+ bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
+ bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor<?x?x?x?xf32, #CCCD>
+ return
+}