[Mlir-commits] [mlir] a1bc979 - [mlir][Bufferization] Do not have read semantics for destination of `tensor.parallel_insert_slice`. (#134169)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Apr 3 09:47:39 PDT 2025
Author: MaheshRavishankar
Date: 2025-04-03T09:47:36-07:00
New Revision: a1bc979aa854c600e64e7500f1b79cd1d2655eb4
URL: https://github.com/llvm/llvm-project/commit/a1bc979aa854c600e64e7500f1b79cd1d2655eb4
DIFF: https://github.com/llvm/llvm-project/commit/a1bc979aa854c600e64e7500f1b79cd1d2655eb4.diff
LOG: [mlir][Bufferization] Do not have read semantics for destination of `tensor.parallel_insert_slice`. (#134169)
`tensor.insert_slice` needs to have read semantics on its destination
operand. Since it has a return value, its semantics are
- Copy dest to result
- Copy source to subview of destination.
`tensor.parallel_insert_slice`, however, has no result. So it does not
need to have read semantics. The op description
[here](https://github.com/llvm/llvm-project/blob/a3ac318e5f8668ec5b79dd86639881dfb2e88b69/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td#L1524)
also says that it is expected to lower to a `memref.subview`, which does
not have read semantics on the destination (it's just a view).
This patch drops the read semantics for destination of
`tensor.parallel_insert_slice` but also makes the `shared_outs` operands
of `scf.forall` have read semantics. Earlier it would rely indirectly on
read semantics of destination operand of `tensor.parallel_insert_slice`
to propagate the read semantics for `shared_outs`. Now that is specified
more directly.
Fixes #133964
---------
Signed-off-by: MaheshRavishankar <mahesh.ravishankar at gmail.com>
Added:
Modified:
mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
mlir/test/Dialect/SCF/one-shot-bufferize.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index f48d2a2df9c3c..cf62ee8bc45b5 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1186,18 +1186,6 @@ struct YieldOpInterface
}
};
-/// Return `true` if the given loop may have 0 iterations.
-bool mayHaveZeroIterations(scf::ForallOp forallOp) {
- for (auto [lb, ub] : llvm::zip(forallOp.getMixedLowerBound(),
- forallOp.getMixedUpperBound())) {
- std::optional<int64_t> lbConst = getConstantIntValue(lb);
- std::optional<int64_t> ubConst = getConstantIntValue(ub);
- if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst)
- return true;
- }
- return false;
-}
-
/// Bufferization of ForallOp. This also bufferizes the terminator of the
/// region. There are op interfaces for the terminators (InParallelOp
/// and ParallelInsertSliceOp), but these are only used during analysis. Not
@@ -1207,17 +1195,11 @@ struct ForallOpInterface
ForallOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
- auto forallOp = cast<ForallOp>(op);
-
- // If the loop has zero iterations, the results of the op are their
- // corresponding shared_outs, meaning that the shared_outs bufferize to a
- // read.
- if (mayHaveZeroIterations(forallOp))
- return true;
-
- // scf::ForallOp alone doesn't bufferize to a memory read, one of the
- // uses of its matching bbArg may.
- return state.isValueRead(forallOp.getTiedBlockArgument(&opOperand));
+ // All tensor operands to `scf.forall` are `shared_outs` and all
+ // shared outs are assumed to be read by the loop. This does not
+ // account for the case where the entire value is over-written,
+ // but being conservative here.
+ return true;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 4ac6eca586961..31014172a9555 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -930,8 +930,7 @@ struct ParallelInsertSliceOpInterface
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
- return insertSliceOpRequiresRead(cast<tensor::ParallelInsertSliceOp>(op),
- opOperand);
+ return opOperand == cast<ParallelInsertSliceOp>(op).getSourceMutable();
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index bb9f7dfdba83f..a1067ec3ba05f 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -946,3 +946,38 @@ func.func @index_switch(%pred: index, %b: tensor<5xf32>, %c: tensor<5xf32>) -> t
// CHECK: return %[[r]]
return %0 : tensor<5xf32>
}
+
+// -----
+
+// See Issue https://github.com/llvm/llvm-project/issues/133964 . Checks that
+// tensor.parallel_insert_slice dest operand does not have read semantics.
+func.func @check_scfforall_inplace_bufferizer(%arg0 : tensor<?x?xf32>,
+ %arg1 : tensor<?x?xf32>,
+ %arg2 : tensor<?xf32> {bufferization.writable = true}) -> tensor<?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %d0 = tensor.dim %arg2, %c0 : tensor<?xf32>
+ %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+ %0 = scf.forall (%arg3) in (%c1) shared_outs(%arg4 = %arg2) -> (tensor<?xf32>) {
+ %1 = tensor.extract_slice %arg0[0, 0][%d0, %d1][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %2 = tensor.extract_slice %arg1[0, 0][%d0, %d1][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %3 = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%1, %2 : tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%arg4 : tensor<?xf32>) {
+ ^bb0(%b0 : f32, %b1: f32, %b2 : f32):
+ %4 = arith.mulf %b0, %b1 : f32
+ %5 = arith.addf %4, %b2 : f32
+ linalg.yield %5 : f32
+ } -> tensor<?xf32>
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %3 into %arg4[0] [%d0] [1] : tensor<?xf32> into tensor<?xf32>
+ }
+ }
+ return %0 : tensor<?xf32>
+}
+// CHECK-LABEL: func @check_scfforall_inplace_bufferizer
+// CHECK-NOT: memref.alloc
More information about the Mlir-commits
mailing list