[Mlir-commits] [mlir] 55c274d - [mlir][Linalg] Drop comprehensive-func-bufferize (12/n)
Nicolas Vasilache
llvmlistbot at llvm.org
Thu Jul 1 04:40:38 PDT 2021
Author: Nicolas Vasilache
Date: 2021-07-01T11:36:24Z
New Revision: 55c274d7d30eb4de129a70bf48a063e740b71c9c
URL: https://github.com/llvm/llvm-project/commit/55c274d7d30eb4de129a70bf48a063e740b71c9c
DIFF: https://github.com/llvm/llvm-project/commit/55c274d7d30eb4de129a70bf48a063e740b71c9c.diff
LOG: [mlir][Linalg] Drop comprehensive-func-bufferize (12/n)
This revision drops the comprehensive bufferization Function pass, which has issues when trying to bufferize constants.
Instead, only the comprehensive-module-bufferize pass is supported by default.
Differential Revision: https://reviews.llvm.org/D105228
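
As a quick migration sketch (the first RUN line is taken from the updated analysis test below; the second is the assumed invocation for full bufferization and is illustrative only):

// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s

The test-analysis-only option only runs the inplaceability analysis and annotates op results, without rewriting tensors into memrefs.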
Added:
Modified:
mlir/include/mlir/Dialect/Linalg/Passes.h
mlir/include/mlir/Dialect/Linalg/Passes.td
mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
Removed:
mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index d80eb9a0652de..27bb50d5a2f2c 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -56,18 +56,13 @@ std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToParallelLoopsPass();
/// Placeholder for now, this is NYI.
std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToAffineLoopsPass();
-/// Create a pass that bufferizes the body of a FuncOp and tries to reuse the
-/// buffers for those arguments that:
-/// a) have been annotated 'inplaceable' and
-/// b) whose buffer uses would be free of memory hazards.
-std::unique_ptr<Pass> createLinalgComprehensiveFuncBufferizePass();
-
/// This pass implements a cross-dialect bufferization approach and performs an
/// analysis to determine which op operands and results may be bufferized in the
/// same buffers. The analysis is performed on topologically sorted CallOp and
/// FuncOp within a module. It provides analyses and bufferization across
-/// function boundaries. Within a single function body, the bufferization used
-/// is that provided by `LinalgComprehensiveFuncBufferizePass`.
+/// function boundaries. Within a function boundary, the analysis is performed
+/// on SSA use-def chains starting from function operands that are annotated
+/// with the 'inplaceable' attribute.
std::unique_ptr<Pass> createLinalgComprehensiveModuleBufferizePass();
/// Create a pass to convert Linalg operations which work on tensors to use
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 3d9833061a090..c638294b12109 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -22,26 +22,6 @@ def ConvertElementwiseToLinalg : FunctionPass<"convert-elementwise-to-linalg"> {
let dependentDialects = ["linalg::LinalgDialect", "memref::MemRefDialect"];
}
-def LinalgComprehensiveFuncBufferize :
- FunctionPass<"linalg-comprehensive-func-bufferize"> {
- let summary = "Bufferize (tensor into memref) the body of a FuncOp and try "
- "to reuse the buffers for those arguments that "
- "a) have been annotated 'inplaceable' and "
- "b) whose buffer uses would be free of memory hazards";
- let description = [{
- This pass implements a cross-dialect bufferization approach and performs an
- analysis to determine which op operands and results may be bufferized in the
- same buffers. The analysis is performed on SSA use-def chains starting from
- function operands that are annotated with the 'inplaceable' attribute.
- }];
- let options = [
- Option<"testAnalysisOnly", "test-analysis-only", "bool",
- /*default=*/"false",
- "Only runs inplaceability analysis (for testing purposes only)">
- ];
- let constructor = "mlir::createLinalgComprehensiveFuncBufferizePass()";
-}
-
def LinalgComprehensiveModuleBufferize :
Pass<"linalg-comprehensive-module-bufferize", "ModuleOp"> {
let summary = "Bufferize (tensor into memref) for a Module.";
@@ -50,8 +30,9 @@ def LinalgComprehensiveModuleBufferize :
analysis to determine which op operands and results may be bufferized in the
same buffers. The analysis is performed on topologically sorted CallOp and
FuncOp within a module. It provides analyses and bufferization across
- function boundaries. Within a single function body, the bufferization used
- is that provided by `-linalg-comprehensive-func-bufferize`.
+ function boundaries. Within a function boundary, the analysis is performed
+ on SSA use-def chains starting from function operands that are annotated
+ with the 'inplaceable' attribute.
}];
let options = [
Option<"testAnalysisOnly", "test-analysis-only", "bool",
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
index 03191a85e506c..dec08dfd4da2c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -979,10 +979,10 @@ bool BufferizationAliasInfo::isSourceEquivalentToAMatchingExtractSliceOp(
/// Apply `fun` to all the members of the equivalence class of `v`.
void BufferizationAliasInfo::applyOnEquivalenceClass(
Value v, function_ref<void(Value)> fun) const {
- for (auto it = equivalentInfo.findLeader(v),
- eit = equivalentInfo.member_end();
- it != eit; ++it) {
- fun(v);
+ auto leaderIt = equivalentInfo.findLeader(v);
+ for (auto mit = leaderIt, meit = equivalentInfo.member_end(); mit != meit;
+ ++mit) {
+ fun(mit->v);
}
}
@@ -1485,9 +1485,8 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo)) {
Value oldRes = callOp->getResult(returnOperand.getOperandNumber());
int64_t idx = bbArg.getArgNumber();
- Value buffer = bvm.lookupOrNull(callOp->getOperand(idx));
- if (!buffer)
- return callOp->emitError() << "operand #" << idx << " not bufferized";
+ Value buffer = lookup(bvm, callOp->getOperand(idx));
+ assert(buffer && "expected bufferized value");
// Add CallOp operand/result equivalence: this is interprocedural info.
aliasInfo.insertNewBufferEquivalence(oldRes, buffer);
map(bvm, oldRes, buffer);
@@ -1504,11 +1503,11 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
continue;
}
- // TODO: Need to hoist above function boundary and add to
- // `hoistedArgumentTypes`.
- if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
- return allocOp->emitError()
- << " needs hoist across function boundary\n";
+ // TODO: Need to hoist above function boundary.
+ if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+ hoistedArguments.push_back(allocOp->getResult(0));
+ continue;
+ }
// Other cases legitimately need to return a tensor, this is currently not
// supported. For instance, if hoisting across function boundary has
@@ -1518,13 +1517,14 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
int64_t returnIdx = returnOperand.getOperandNumber();
return returnOp->emitError()
- << " bufferize result #" << returnIdx << "\n";
+ << "buffer result #" << returnIdx << " not produced by an alloc\n";
}
}
// 2. Compute bufferized FunctionType.
SmallVector<Type> argumentTypes{callOp->getOperandTypes()};
- llvm::append_range(argumentTypes, ValueRange{hoistedArguments}.getTypes());
+ ValueRange hoistedArgs{hoistedArguments};
+ llvm::append_range(argumentTypes, hoistedArgs.getTypes());
// Get the bufferized FunctionType for funcOp or construct it if not yet
// available.
FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
@@ -1543,8 +1543,8 @@ bufferize(OpBuilder &b, CallOpInterface callOp, BlockAndValueMapping &bvm,
// Tensor operands are guaranteed to have been buferized.
int64_t idx = opOperand.getOperandNumber();
- Value buffer = bvm.lookupOrNull(tensorOperand);
- assert(buffer && " missing buffer for operand");
+ Value buffer = lookup(bvm, tensorOperand);
+ assert(buffer && "expected bufferized value");
// Caller / callee type mistmatch is handled with a CastOp.
auto memRefType = bufferizedFuncType.getInput(idx);
@@ -1592,7 +1592,7 @@ static LogicalResult bufferize(OpBuilder &b, tensor::CastOp castOp,
? rankedMemRefType.getAffineMaps()
: ArrayRef<AffineMap>{};
Type memRefType = getContiguousOrUnrankedMemRefType(
- castOp.getResult().getType(), {}, memorySpace);
+ castOp.getResult().getType(), affineMaps, memorySpace);
Value res = b.create<memref::CastOp>(castOp.getLoc(), memRefType,
lookup(bvm, castOp.source()));
aliasInfo.insertNewBufferEquivalence(res, castOp.getResult());
@@ -2176,64 +2176,21 @@ static LogicalResult bufferizeFuncOpInternals(
return failure(result.wasInterrupted());
}
-namespace {
-struct LinalgComprehensiveFuncBufferize
- : public LinalgComprehensiveFuncBufferizeBase<
- LinalgComprehensiveFuncBufferize> {
- void runOnFunction() override;
-
- void getDependentDialects(DialectRegistry &registry) const override {
- registry.insert<linalg::LinalgDialect, memref::MemRefDialect>();
- }
-};
-} // end namespace
-
-void LinalgComprehensiveFuncBufferize::runOnFunction() {
- auto funcOp = getFunction();
-
- // Analysis phase.
- DominanceInfo domInfo(funcOp);
- BufferizationAliasInfo aliasInfo(funcOp);
- // If the analysis fails, just return. This is expected to reset the IR and no
- // single OpResult should be marked inPlace.
- if (failed(inPlaceAnalysisFuncOpBody(funcOp, aliasInfo, domInfo))) {
- signalPassFailure();
- return;
- }
-
- if (testAnalysisOnly)
- return;
-
- // Bufferization phase.
- BlockAndValueMapping bvm;
- DenseMap<FuncOp, FunctionType> bufferizedFunctionTypes;
- if (failed(bufferizeFuncOpInternals(funcOp, bvm, aliasInfo,
- bufferizedFunctionTypes)))
- signalPassFailure();
-
- // Post-pass cleanup of inplaceable attributes.
- funcOp.walk([&](Operation *op) { op->removeAttr(kInPlaceResultsAttrName); });
-}
-
-std::unique_ptr<Pass> mlir::createLinalgComprehensiveFuncBufferizePass() {
- return std::make_unique<LinalgComprehensiveFuncBufferize>();
-}
-
//===----------------------------------------------------------------------===//
// Bufferization entry-point for modules.
//===----------------------------------------------------------------------===//
-/// Return the op with Allocate MemoryEffect if `v` is equivalent to an such
+/// Return the op with Allocate MemoryEffect if `v` is equivalent to such an
/// an op. Return null otherwise.
static Operation *getEquivalentAlloc(Value value,
const BufferizationAliasInfo &aliasInfo) {
- Operation *res;
+ Operation *res = nullptr;
aliasInfo.applyOnEquivalenceClass(value, [&](Value v) {
if (!res)
if (auto interface =
dyn_cast_or_null<MemoryEffectOpInterface>(v.getDefiningOp()))
if (auto effect =
- interface.getEffectOnValue<MemoryEffects::Allocate>(value))
+ interface.getEffectOnValue<MemoryEffects::Allocate>(v))
res = v.getDefiningOp();
});
return res;
@@ -2249,9 +2206,12 @@ getEquivalentEnclosingFuncBBArg(Value v,
if (!funcOp)
funcOp = op->getParentOfType<FuncOp>();
assert(funcOp && "expected non-null FuncOp");
- for (BlockArgument bbArg : funcOp.getArguments())
+ for (BlockArgument bbArg : funcOp.getArguments()) {
+ if (!bbArg.getType().isa<RankedTensorType>())
+ continue;
if (aliasInfo.areEquivalentBufferizedValues(v, bbArg))
return bbArg;
+ }
return nullptr;
}
@@ -2292,9 +2252,6 @@ static LogicalResult bufferizeFuncOpBoundary(
// externally).
// -> Figure out a better layering.
TypeRange resultTypes;
- FunctionType bufferizedFuncType =
- getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
- resultTypes, bufferizedFunctionTypes);
// Corner case: Bodiless FuncOp
// ============================
@@ -2305,6 +2262,9 @@ static LogicalResult bufferizeFuncOpBoundary(
if (llvm::any_of(funcOp.getType().getResults(), isaTensor))
return funcOp->emitError() << "cannot bufferize bodiless function that "
<< "returns a tensor";
+ FunctionType bufferizedFuncType =
+ getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
+ TypeRange{}, bufferizedFunctionTypes);
funcOp.setType(bufferizedFuncType);
LLVM_DEBUG(DBGS() << "End bufferizeFuncOpBoundary no fun body: " << funcOp);
return success();
@@ -2323,16 +2283,29 @@ static LogicalResult bufferizeFuncOpBoundary(
Value returnVal = returnOperand.get();
if (getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo))
continue;
- // TODO: Need to hoist above function boundary. If this is not possible due
- // to data-depedent sizes, we need a better type than memref.
- if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
- return allocOp->emitError() << " needs hoist across function boundary\n";
+
+ // TODO: Need to hoist above function boundary.
+ if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+ returnValues.push_back(allocOp->getResult(0));
+ continue;
+ }
+
+ // Other cases legitimately need to return a tensor, this is currently not
+ // supported. For instance, if hoisting across function boundary has
+ // failed, it may be due to e.g. data-dependent sizes. In such a case, we
+ // would need a better type than memref.
int64_t returnIdx = returnOperand.getOperandNumber();
- return returnOp->emitError() << " bufferize result #" << returnIdx << "\n";
+ return returnOp->emitError()
+ << "buffer result #" << returnIdx << " not produced by an alloc\n";
}
// 2. Rewrite the terminator without the inPlace bufferizable values.
- OpBuilder(returnOp).create<ReturnOp>(returnOp.getLoc(), returnValues);
+ ValueRange retValues{returnValues};
+ FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
+ funcOp, funcOp.getType().getInputs(), retValues.getTypes(),
+ bufferizedFunctionTypes);
+ OpBuilder b(returnOp);
+ b.create<ReturnOp>(returnOp.getLoc(), returnValues);
returnOp->erase();
// 3. Rewrite the bbArgs.
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
deleted file mode 100644
index 41e698f97c873..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file -verify-diagnostics
-
-// -----
-
-func @scf_for(%A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %C : tensor<4xf32>,
- %lb : index, %ub : index, %step : index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
- -> (tensor<?xf32>, tensor<?xf32>)
- {
- %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
- %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // Throw a wrench in the system by swapping yielded values: this result in a
- // ping-pong of values at each iteration on which we currently want to fail.
-
- // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
- scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
- }
-
- return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
deleted file mode 100644
index 5234d85b0b5b1..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
+++ /dev/null
@@ -1,474 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// Simple cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
- -> (tensor<4xf32>, tensor<8xf32>)
-{
- // tensor.extract_slice is not used in a write, it is not compelled to
- // bufferize out of place. Let callers decide whether they want to create
- // aliasing subviews at all call sites or whether they allocate.
- // This is true irrespective of whether the function argument is inplaceable.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
-
- return %r0, %r1: tensor<4xf32>, tensor<8xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(
- %A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %C : tensor<4xf32>)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // must bufferize out of place.
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // bufferizes inplace.
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @conflict_on_B
-func @conflict_on_B(
- %A : tensor<4x4xf32> {linalg.inplaceable = true},
- %B : tensor<4x4xf32> {linalg.inplaceable = true})
- -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
-{
- // matmul output operand interferes with input operand.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %C = linalg.matmul ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%B: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- // matmul output operand interferes with input operand.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %D = linalg.matmul ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%B: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- // matmul output operand does not interferes with input operand.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%B: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Length-1 producer-consumer cases.
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_extract_slice
-func @extract_slice_extract_slice(
- %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
- -> (tensor<2xf32>, tensor<2xf32>)
-{
- // tensor.extract_slice is not used in a write, it is not compelled to
- // bufferize out of place. Let callers decide whether they want to create
- // aliasing subviews at all call sites or whether they allocate.
- // This is true irrespective of whether the function argument is inplaceable.
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
- return %r1, %r3: tensor<2xf32>, tensor<2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_insert_slice
-func @insert_slice_insert_slice(
- %A : tensor<?xf32> {linalg.inplaceable = true},
- %A2 : tensor<4xf32> {linalg.inplaceable = true},
- %A3 : tensor<2xf32> {linalg.inplaceable = true},
- %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
- // CHECK: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // CHECK: {__inplace_results_attr__ = ["false"]}
- %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
- // CHECK: {__inplace_results_attr__ = ["false"]}
- %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
-func @extract_slice_nonmatching_insert_slice(
- %A : tensor<?xf32> {linalg.inplaceable = true},
- %B : tensor<?xf32>, %idx: index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // %r1 bufferizes inplace because %A is inplaceable.
- // %r0 is an overlapping tensor.extract_slice that does not match, it must be
- // out of place.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // %r1 can bufferize inplace fine.
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // %r3 does bufferizes inplace because %B is not inplaceable.
- // %r0 is an overlapping tensor.extract_slice that does not match, but does
- // not alias with the buffer coming from %r3 so it can actually bufferize
- // inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // %r3 cannot bufferize inplace since %B is not inplaceable.
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_matching_insert_slice
-func @extract_slice_matching_insert_slice(
- %A : tensor<?xf32> {linalg.inplaceable = true},
- %B : tensor<?xf32>)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // %r1 bufferizes inplace because %A is inplaceable.
- // %r0 is a tensor.extract_slice that matches, it can also be bufferized
- // inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
- // inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // tensor.insert_slice cannot bufferize inplace.
- // This should have been captured by a canonicalization pattern and it would
- // be unproductive to have special logic in bufferization to encode matching
- // insert_slice(extract_slice(A), A).
- // CHECK: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_linalg_readonly_use
-func @extract_slice_linalg_readonly_use(
- %A : tensor<?x?xf32>,
- %B : tensor<4x4xf32>,
- %C : tensor<4x4xf32> {linalg.inplaceable = true})
- -> (tensor<4x4xf32>, tensor<4x4xf32>)
-{
- // tensor.extract_slice is only used as a read, no interference irrespective
- // of user's inplace status.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
- // matmul output operand is not inplaceable at the function boundary.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %D = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%B: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- // matmul output operand is inplaceable at the function boundary.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %E = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%C: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
- %A : tensor<4x4xf32>,
- %B : tensor<?x?xf32>,
- %C : tensor<?x?xf32> {linalg.inplaceable = true})
- -> (tensor<4x4xf32>, tensor<4x4xf32>)
-{
- // Step 3. %sB forward propagates to a write in %D but it is not inplace.
- // So this is only ever read and can bufferize inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
- // Step 2. %sB has a read interference in %E, it does not bufferize inplace.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %D = linalg.matmul ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%sB: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- // Step 4. %sC forward propagates to an inplace write in %E.
- // %sC backward propagates to %C which is inplaceable.
- // As a consequence this is bufferized inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
- // Step 1. %sC backprops to the tensor.extract_slice producer which is not
- // considered an interference. This bufferizes inplace.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%sC: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Transitive cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
- %A : tensor<4x4xf32>,
- %B : tensor<?x?xf32>,
- %C : tensor<?x?xf32> {linalg.inplaceable = true})
- -> (tensor<4x4xf32>, tensor<4x4xf32>)
-{
- // Step 4. %sB forward propagates to an inplace write in %D.
- // %sB backward propagates to %B which is not inplaceable.
- // As a consequence this is bufferized out of place.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
- // Step 1. %sB backprops to the tensor.extract_slice producer which is not
- // considered an interference. This bufferizes inplace.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %D = linalg.matmul ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%sB: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- // Step 3. %sC forward propagates to an inplace write in %E.
- // %sC backward propagates to %C which is inplaceable.
- // As a consequence this is bufferized inplace.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
- // Step 1. %sC backprops to the tensor.extract_slice producer which is not
- // considered an interference. This bufferizes inplace.
- // CHECK: linalg.matmul
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
- outs(%sC: tensor<4x4xf32>)
- -> tensor<4x4xf32>
-
- return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @nested_extract_slice_and_insert
-func @nested_extract_slice_and_insert(
- %A : tensor<?x?xf32>,
- %B : tensor<?x?xf32> {linalg.inplaceable = true},
- %C : tensor<?x?xf32> {linalg.inplaceable = true},
- %idx : index)
- -> (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
-{
- %f0 = constant 0.0 : f32
-
- // 2-level matching tensor.extract_slice / tensor.insert_slice into non
- // inplaceable %A.
- // - %rA is not inplaceable because %A is not inplaceable at function boundary.
- // - once %rA is deemed not inplaceable, nothing prevent %rsA to be inplaceable
- // - this propagates to %FA and %ssA being inplaceable.
- // - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
- // inplaceable and so %sA is not inplaceable.
- // CHECK: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- // CHECK-NEXT: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: fill
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
- %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
- %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
- %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
- // 3-level matching tensor.extract_slice / tensor.insert_slice into
- // inplaceable %B.
- // CHECK-NEXT: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.extract_slice
- // Atm, this 2nd tensor.extract_slice fails to bufferize inplace because
- // clobbering analysis conservatively test for equivalent buffers.
- // TODO: This is currently too restrictive and misses clobberings.
- // When available, use container-containee analysis.
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- // CHECK-NEXT: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: fill
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
- %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
- %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
- %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
- %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
- %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
- // 2-level matching tensor.extract_slice / tensor.insert_slice into
- // inplaceable %C with a twist.
- // Throw a wrench in the system: %rsC production sizes do not match %ssC.
- // CHECK-NEXT: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // The tensor.insert_slice that would be candidate for matching does not actually
- // match. That tensor.insert_slice can still be bufferized inplace nonetheless
- // but this tensor.extract_slice, which bufferizes to an inplace write, cannot.
- // CHECK-NEXT: tensor.extract_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
- // CHECK-NEXT: fill
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
- %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
- %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
- %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
- return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %lb : index, %ub : index, %step : index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: scf.for
- // CHECK-NEXT: scf.yield
- // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
- %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
- scf.yield %t : tensor<?xf32>
- }
-
- // CHECK: scf.for
- // CHECK-NEXT: scf.yield
- // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
- %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
- scf.yield %t : tensor<?xf32>
- }
-
- return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %C : tensor<4xf32>,
- %lb : index, %ub : index, %step : index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: scf.for
- // scf.for bbArgs are always inplaceable seen from ops inside the body:
- // 1. Either the matching tensor is not inplaceable and an alloc occurs
- // which makes bbArg inplaceable.
- // 2. Or it is already inplaceable and so is bbArg.
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: tensor.insert_slice
- // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
- // CHECK-NEXT: scf.yield
- // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
- %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
- -> (tensor<?xf32>, tensor<?xf32>)
- {
- %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
- %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
- scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
- }
-
- return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
deleted file mode 100644
index e217a7062a94f..0000000000000
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
+++ /dev/null
@@ -1,353 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize -split-input-file | FileCheck %s
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @fill_inplace(
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: tensor<?xf32> {linalg.inplaceable = true})
-func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
- // CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
- // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
- %f0 = constant 0.0 : f32
-
- /// Inplaceable, no alloc
- // CHECK-NOT: alloc
- // CHECK: linalg.fill(%[[F0]], %[[I]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
- %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
- // CHECK: %[[R:.*]] = memref.tensor_load %[[I]] : memref<?xf32, #[[$map_2d_dyn]]>
- // CHECK: return %[[R]] : tensor<?xf32>
- return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-/// No linalg.inplaceable flag, must allocate.
-// CHECK-LABEL: func @not_inplace(
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: tensor<?xf32>)
-func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
- // CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
- // CHECK: %[[D0:.*]] = memref.dim %[[I]], {{.*}} : memref<?xf32, #[[$map_2d_dyn]]>
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
- // CHECK: %[[I2:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, #map>
-
- // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
- %f0 = constant 0.0 : f32
-
- // CHECK: linalg.fill(%[[F0]], %[[I2]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
- %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
- // CHECK: dealloc %[[ALLOC]] : memref<?xf32>
- // CHECK: %[[R:.*]] = memref.tensor_load %[[I2]] : memref<?xf32, #[[$map_2d_dyn]]>
- // CHECK: return %[[R]] : tensor<?xf32>
- return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: tensor<?x?xf32>
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
- %f0 = constant 0.0 : f32
-
- // CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast %[[A]] : memref<?x?xf32
-
- /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc.
- // CHECK: %[[ALLOC:.*]] = memref.alloc
- // CHECK: %[[CAST:.*]] = memref.cast %[[ALLOC]]
- // CHECK: linalg.fill({{.*}}, %[[CAST]]
- %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-
- /// The second op has no interfering reads and can reuse.
- // CHECK-NOT: alloc
- // CHECK: linalg.matmul{{.*}}outs(%[[BUFFER_CAST]]
- %r = linalg.matmul ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%A: tensor<?x?xf32>)
- -> tensor<?x?xf32>
- return %r: tensor<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
- /// Within op multiple uses of %A, must alloc.
- // CHECK: alloc
- %r = linalg.matmul ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%A: tensor<?x?xf32>)
- -> tensor<?x?xf32>
- return %r: tensor<?x?xf32>
-}
-// -----
-
-// CHECK-LABEL: func @vec_inplace
-func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
- -> tensor<?xf32>
-{
- %c0 = constant 0 : index
- // CHECK-NOT: alloc
- %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
- return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @vec_not_inplace
-func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- %c0 = constant 0 : index
- %c1 = constant 1 : index
-
- // CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast {{.*}} : memref<?xf32, #[[$map_2d_dyn]]>
-
- /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc.
- // CHECK: %[[ALLOC:.*]] = memref.alloc
- // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
- %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
-
- /// The second vector.transfer has no interfering reads and can reuse the buffer.
- // CHECK-NOT: alloc
- // CHECK-NEXT: vector.transfer_write {{.*}}, %[[BUFFER_CAST]]
- %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
- return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inplaceable = true},
- %t0 : tensor<4xf32>, %t1 : tensor<4xf32> {linalg.inplaceable = true})
- -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: %[[BUFFER_CAST_A0:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[BUFFER_CAST_A1:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[BUFFER_CAST_t0:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
- // CHECK: %[[BUFFER_CAST_t1:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
- // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A0:.*]] = memref.alloc
- // CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
- // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
- // CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A0]])
- %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
- // CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
- // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
- // CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A0_2]])
- %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
- // CHECK: linalg.copy(%[[BUFFER_CAST_A1]]
- // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
- // CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A1]])
- %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // Do not realloc the large tensor. Copy the tensor.extract_slice.
- // CHECK-NOT: alloc
- // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[BUFFER_CAST_A1]]
- // CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A1_2]])
- %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
- -> tensor<?xf32>
-{
- %f0 = constant 0.0 : f32
-
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
- // CHECK-NOT: alloc
- // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
- // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
- %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- /// Overwrite BUFFER_CAST_A inplace.
- // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
- %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
- return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
- -> tensor<?xf32>
-{
- %f0 = constant 0.0 : f32
-
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
- // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
- %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
- // CHECK-NOT: alloc
- // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
- /// Overwrite BUFFER_CAST_A inplace by copying into the subview.
- // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
- %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
- -> tensor<?xf32>
-{
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
- // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
- // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
- // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
- // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
- %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
- return %r0: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- %f0 = constant 0.0 : f32
-
- // CHECK-DAG: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32{{.*}}
- // CHECK-DAG: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32{{.*}}
-
- // tensor.insert_slice is bufferized first, %A is inplaceable so we can make this inplace
- // CHECK-DAG: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
- // CHECK-DAG: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
- %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // fill would interfere with %r0 that is also being returned.
- // So we need to bufferize it out of place and make a new alloc.
- // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
- // CHECK-DAG: %[[ALLOC_CAST_DYNAMIC:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, {{.*}}
- // CHECK: linalg.fill(%{{.*}}, %[[ALLOC_CAST_DYNAMIC]]
- // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
- %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
- // CHECK-DAG: %[[RET_A:.*]] = memref.tensor_load %[[BUFFER_CAST_A]] : memref<?xf32, {{.*}}
- // CHECK-DAG: %[[RET_B:.*]] = memref.tensor_load %[[ALLOC_CAST_DYNAMIC]] : memref<?xf32, {{.*}}
- // CHECK: return %[[RET_B]], %[[RET_A]]
- return %r1, %r0: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
- -> tensor<4xf32>
-{
- // This bufferizes to a pattern that the cross-function boundary pass needs to
- // convert into a new memref argument at all call site; this may be either:
- // - an externally created aliasing subview (if we want to allow aliasing
- // function arguments).
- // - a new alloc + copy (more expensive but does not create new function
- // argument aliasing).
- // CHECK-NOT: alloc
- // CHECK-NOT: copy
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
- // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1]
- // CHECK: %[[RES:.*]] = memref.tensor_load %[[SV]]
- %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
- // CHECK: return %[[RES]]
- return %r0: tensor<4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %lb : index, %ub : index, %step : index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
- // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
- // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
- // The first scf.for remains but just turns into dead code.
- %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
- scf.yield %t : tensor<?xf32>
- }
-
- // The second scf.for remains but just turns into dead code.
- %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
- scf.yield %t : tensor<?xf32>
- }
-
- // Cross function call alloc/dealloc pattern must be hoist out.
- // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
- // CHECK: %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]]
- // Returning tensor_load of the buffer cast makes the %r1 loop dead.
- // CHECK: %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B:.*]]
- // CHECK: return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
- return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(
- %A : tensor<?xf32>,
- %B : tensor<?xf32> {linalg.inplaceable = true},
- %C : tensor<4xf32>,
- %lb : index, %ub : index, %step : index)
- -> (tensor<?xf32>, tensor<?xf32>)
-{
- // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
- // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
- // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
- // CHECK: %[[BUFFER_CAST_C:.*]] = memref.buffer_cast
- // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
- // CHECK: scf.for {{.*}} iter_args(%[[bbA:.*]] = %{{.*}}, %[[bbB:.*]] = %{{.*}})
- %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
- -> (tensor<?xf32>, tensor<?xf32>)
- {
- // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
- // %ttA bufferizes to direct copy of %BUFFER_CAST_C into %svA
- // CHECK: linalg.copy(%[[BUFFER_CAST_C]], %[[svA]])
- %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // %ttB bufferizes to direct copy of %BUFFER_CAST_C into %BUFFER_CAST_B
- // CHECK: %[[svB:.*]] = memref.subview %[[BUFFER_CAST_B]][0] [4] [1]
- // CHECK: linalg.copy(%[[BUFFER_CAST_C]], %[[svB]])
- %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
- // Yielding bbA and bbB will canonicalize away into oblivion.
- // CHECK: scf.yield %[[bbA]], %[[bbB]] : tensor<?xf32>, tensor<?xf32>
- scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
- }
-
- // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
- // CHECK: %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]] : memref<?xf32>
- // CHECK: %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B]] : memref<?xf32, #map>
- // CHECK: return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
- return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
index 108119467ea63..a580cbb36060f 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -1,5 +1,483 @@
// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
+//===----------------------------------------------------------------------===//
+// Simple cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_fun
+func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
+ -> (tensor<4xf32>, tensor<8xf32>)
+{
+ // tensor.extract_slice is not used in a write, it is not compelled to
+ // bufferize out of place. Let callers decide whether they want to create
+ // aliasing subviews at all call sites or whether they allocate.
+ // This is true irrespective of whether the function argument is inplaceable.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
+
+ return %r0, %r1: tensor<4xf32>, tensor<8xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_fun
+func @insert_slice_fun(
+ %A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %C : tensor<4xf32>)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // must bufferize out of place.
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // bufferizes inplace.
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @conflict_on_B
+func @conflict_on_B(
+ %A : tensor<4x4xf32> {linalg.inplaceable = true},
+ %B : tensor<4x4xf32> {linalg.inplaceable = true})
+ -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
+{
+ // matmul output operand interferes with input operand.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %C = linalg.matmul ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%B: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ // matmul output operand interferes with input operand.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %D = linalg.matmul ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%B: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ // matmul output operand does not interferes with input operand.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%B: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Length-1 producer-consumer cases.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_extract_slice
+func @extract_slice_extract_slice(
+ %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
+ -> (tensor<2xf32>, tensor<2xf32>)
+{
+ // tensor.extract_slice is not used in a write, it is not compelled to
+ // bufferize out of place. Let callers decide whether they want to create
+ // aliasing subviews at all call sites or whether they allocate.
+ // This is true irrespective of whether the function argument is inplaceable.
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+ return %r1, %r3: tensor<2xf32>, tensor<2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_insert_slice
+func @insert_slice_insert_slice(
+ %A : tensor<?xf32> {linalg.inplaceable = true},
+ %A2 : tensor<4xf32> {linalg.inplaceable = true},
+ %A3 : tensor<2xf32> {linalg.inplaceable = true},
+ %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["false"]}
+ %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+ // CHECK: {__inplace_results_attr__ = ["false"]}
+ %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
+func @extract_slice_nonmatching_insert_slice(
+ %A : tensor<?xf32> {linalg.inplaceable = true},
+ %B : tensor<?xf32>, %idx: index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // %r1 bufferizes inplace because %A is inplaceable.
+ // %r0 is an overlapping tensor.extract_slice that does not match, it must be
+ // out of place.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // %r1 can bufferize inplace fine.
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // %r3 does not bufferize inplace because %B is not inplaceable.
+  // %r2 is an overlapping tensor.extract_slice that does not match, but it does
+  // not alias with the buffer coming from %r3, so it can actually bufferize
+  // inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // %r3 cannot bufferize inplace since %B is not inplaceable.
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
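+
+// Buffer-level intuition for the nonmatching case above (a hedged sketch, not
+// checked by this test): an inplace %r0 would alias %A[0, 4), which the inplace
+// write of %r1 at the dynamic offset %idx may overlap, so the slice is first
+// copied into a private allocation, roughly:
+//
+//   %alloc = memref.alloc() : memref<4xf32>
+//   %sv = memref.subview %A[0] [4] [1] : memref<?xf32, #map> to memref<4xf32, #map>
+//   linalg.copy(%sv, %alloc) : memref<4xf32, #map>, memref<4xf32>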
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_matching_insert_slice
+func @extract_slice_matching_insert_slice(
+ %A : tensor<?xf32> {linalg.inplaceable = true},
+ %B : tensor<?xf32>)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // %r1 bufferizes inplace because %A is inplaceable.
+ // %r0 is a tensor.extract_slice that matches, it can also be bufferized
+ // inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
+ // inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // tensor.insert_slice cannot bufferize inplace because %B is not inplaceable.
+  // The matching insert_slice(extract_slice(A), A) case should have been folded
+  // by a canonicalization pattern (see the sketch after this function); it would
+  // be unproductive to encode special logic for it in bufferization.
+ // CHECK: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
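+
+// The canonicalization alluded to above would fold the matching pair away
+// before bufferization ever sees it; an illustrative sketch (not a test):
+//
+//   %s = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+//   %r = tensor.insert_slice %s into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+//
+// folds %r to just %B, so no insert_slice remains to bufferize.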
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_linalg_readonly_use
+func @extract_slice_linalg_readonly_use(
+ %A : tensor<?x?xf32>,
+ %B : tensor<4x4xf32>,
+ %C : tensor<4x4xf32> {linalg.inplaceable = true})
+ -> (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // tensor.extract_slice is only used as a read, so there is no interference
+  // irrespective of the user's inplace status.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+ // matmul output operand is not inplaceable at the function boundary.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %D = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%B: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ // matmul output operand is inplaceable at the function boundary.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %E = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%C: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+ %A : tensor<4x4xf32>,
+ %B : tensor<?x?xf32>,
+ %C : tensor<?x?xf32> {linalg.inplaceable = true})
+ -> (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // Step 3. %sB forward propagates to a write in %D, but that write is not
+  // inplace. So %sB is only ever read and can bufferize inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 2. %sB has a read interference in %E, so %D does not bufferize inplace.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %D = linalg.matmul ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%sB: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ // Step 4. %sC forward propagates to an inplace write in %E.
+ // %sC backward propagates to %C which is inplaceable.
+ // As a consequence this is bufferized inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+ // Step 1. %sC backprops to the tensor.extract_slice producer which is not
+ // considered an interference. This bufferizes inplace.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%sC: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
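+
+// Rough buffer-level reading of the markings above (a hedged sketch, not
+// verified by this analysis-only test): the out-of-place %D gets a fresh 4x4
+// buffer initialized from the %sB subview, while %E writes directly into the
+// subview of %C:
+//
+//   %svB = memref.subview %B[0, 0] [4, 4] [1, 1] : memref<?x?xf32, #map> to memref<4x4xf32, #map>
+//   %alloc = memref.alloc() : memref<4x4xf32>
+//   linalg.copy(%svB, %alloc) : memref<4x4xf32, #map>, memref<4x4xf32>
+//   // %D then computes into %alloc; %E computes into a subview of %C.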
+
+//===----------------------------------------------------------------------===//
+// Transitive cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+ %A : tensor<4x4xf32>,
+ %B : tensor<?x?xf32>,
+ %C : tensor<?x?xf32> {linalg.inplaceable = true})
+ -> (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+ // Step 4. %sB forward propagates to an inplace write in %D.
+ // %sB backward propagates to %B which is not inplaceable.
+ // As a consequence this is bufferized out of place.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 2. %sB backprops to the tensor.extract_slice producer which is not
+ // considered an interference. This bufferizes inplace.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %D = linalg.matmul ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%sB: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ // Step 3. %sC forward propagates to an inplace write in %E.
+ // %sC backward propagates to %C which is inplaceable.
+ // As a consequence this is bufferized inplace.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+ // Step 1. %sC backprops to the tensor.extract_slice producer which is not
+ // considered an interference. This bufferizes inplace.
+ // CHECK: linalg.matmul
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+ outs(%sC: tensor<4x4xf32>)
+ -> tensor<4x4xf32>
+
+ return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @nested_extract_slice_and_insert
+func @nested_extract_slice_and_insert(
+ %A : tensor<?x?xf32>,
+ %B : tensor<?x?xf32> {linalg.inplaceable = true},
+ %C : tensor<?x?xf32> {linalg.inplaceable = true},
+ %idx : index)
+ -> (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
+{
+ %f0 = constant 0.0 : f32
+
+  // 2-level matching tensor.extract_slice / tensor.insert_slice into
+  // non-inplaceable %A.
+  // - %rA is not inplaceable because %A is not inplaceable at function boundary.
+  // - once %rA is deemed not inplaceable, nothing prevents %rsA from being inplaceable;
+  // - this propagates to %FA and %ssA being inplaceable.
+  // - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
+  //   inplaceable and so %sA is not inplaceable.
+ // CHECK: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ // CHECK-NEXT: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: fill
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+ %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+ %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
+ %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+ // 3-level matching tensor.extract_slice / tensor.insert_slice into
+ // inplaceable %B.
+ // CHECK-NEXT: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.extract_slice
+  // At the moment, this 2nd tensor.extract_slice fails to bufferize inplace
+  // because the clobbering analysis conservatively tests for equivalent buffers.
+ // TODO: This is currently too restrictive and misses clobberings.
+ // When available, use container-containee analysis.
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ // CHECK-NEXT: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: fill
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
+ %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
+ %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+ %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
+ %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
+ %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+ // 2-level matching tensor.extract_slice / tensor.insert_slice into
+ // inplaceable %C with a twist.
+  // Throw a wrench in the system: the sizes %rsC is inserted with do not match %ssC.
+ // CHECK-NEXT: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // The tensor.insert_slice that would be a candidate for matching does not
+  // actually match. That tensor.insert_slice can nonetheless still be bufferized
+  // inplace, but this tensor.extract_slice, which bufferizes to an inplace write,
+  // cannot.
+ // CHECK-NEXT: tensor.extract_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+ // CHECK-NEXT: fill
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+ %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+ %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+ %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
+ %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+ return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @scf_for_yield_only
+func @scf_for_yield_only(%A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %lb : index, %ub : index, %step : index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // CHECK: scf.for
+ // CHECK-NEXT: scf.yield
+ // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
+ %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+ scf.yield %t : tensor<?xf32>
+ }
+
+ // CHECK: scf.for
+ // CHECK-NEXT: scf.yield
+ // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
+ %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+ scf.yield %t : tensor<?xf32>
+ }
+
+ return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
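+
+// At the buffer level (see the matching @scf_for_yield_only test in
+// comprehensive-module-bufferize.mlir further down in this diff), the
+// non-inplaceable %A side materializes as an alloc + copy that the loop then
+// operates on, roughly:
+//
+//   %alloc = memref.alloc(%d0) : memref<?xf32>
+//   linalg.copy(%A, %alloc) : memref<?xf32, #map>, memref<?xf32>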
+
+// -----
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %C : tensor<4xf32>,
+ %lb : index, %ub : index, %step : index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // CHECK: scf.for
+  // scf.for bbArgs are always inplaceable as seen from ops inside the body:
+  //   1. Either the matching tensor is not inplaceable and an alloc occurs,
+  //      which makes the bbArg inplaceable.
+  //   2. Or it is already inplaceable, and so is the bbArg.
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: tensor.insert_slice
+ // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+ // CHECK-NEXT: scf.yield
+ // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
+ %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+ -> (tensor<?xf32>, tensor<?xf32>)
+ {
+ %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+ %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+ scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+ }
+
+ return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
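+
+// At the buffer level (see the matching @scf_for_with_tensor.insert_slice test
+// in comprehensive-module-bufferize.mlir further down in this diff), the loop
+// carries no tensors at all: the body copies straight into the buffers, roughly
+//
+//   linalg.copy(%C, %svA) : memref<4xf32, #map>, memref<4xf32, #map>
+//   linalg.copy(%C, %svB) : memref<4xf32, #map>, memref<4xf32, #map>
+//
+// where %svA / %svB are subviews taken once, outside the loop.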
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
func private @foo(tensor<64xf32>)
// CHECK-LABEL: dependence_through_call
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
index d6a6d7c67f6cf..78f84cc8540c4 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
@@ -44,3 +44,44 @@ func @bar() {
call @foo() : () -> ()
return
}
+
+// -----
+
+func @scf_for(%A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %C : tensor<4xf32>,
+ %lb : index, %ub : index, %step : index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+ -> (tensor<?xf32>, tensor<?xf32>)
+ {
+ %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+ %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // Throw a wrench in the system by swapping yielded values: this results in
+    // a ping-pong of values at each iteration, on which we currently want to fail.
+
+ // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
+ scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
+ }
+
+ return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
+ -> tensor<4xf32>
+{
+ // This bufferizes to a pattern that the cross-function boundary pass needs to
+  // convert into a new memref argument at all call sites; this may be either:
+ // - an externally created aliasing subview (if we want to allow aliasing
+ // function arguments).
+ // - a new alloc + copy (more expensive but does not create new function
+ // argument aliasing).
+ %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+ // expected-error @+1 {{buffer result #0 not produced by an alloc}}
+ return %r0: tensor<4xf32>
+}
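+
+// A hedged sketch of the "new alloc + copy" option mentioned above (purely
+// illustrative; the pass does not currently produce this, hence the error):
+//
+//   %alloc = memref.alloc() : memref<4xf32>
+//   %sv = memref.subview %A[0] [4] [1] : memref<?xf32, #map> to memref<4xf32, #map>
+//   linalg.copy(%sv, %alloc) : memref<4xf32, #map>, memref<4xf32>
+//   return %alloc : memref<4xf32>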
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index b71f6f92d51ed..bc6488bca8e58 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -1,5 +1,355 @@
// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @fill_inplace(
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
+ // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+ %f0 = constant 0.0 : f32
+
+ /// Inplaceable, no alloc
+ // CHECK-NOT: alloc
+ // CHECK: linalg.fill(%[[F0]], %[[A]]) : f32, memref<?xf32, #[[$map_1d_dyn]]>
+ %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+/// No linalg.inplaceable flag, must allocate.
+// CHECK-LABEL: func @not_inplace(
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>) -> memref<?xf32> {
+func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
+ // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+ %f0 = constant 0.0 : f32
+
+ // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$map_1d_dyn]]>
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
+ // CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref<?xf32>
+ %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+ // CHECK: dealloc %[[ALLOC]] : memref<?xf32>
+ // CHECK: return %[[ALLOC]] : memref<?xf32>
+ return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+
+// CHECK-LABEL: func @not_inplace
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, #[[$map_2d_dyn]]>) {
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+ %f0 = constant 0.0 : f32
+
+  /// Cross-op multiple uses of %A: the first op, whose result has interfering reads, must alloc.
+ // CHECK: %[[ALLOC:.*]] = memref.alloc
+ // CHECK: linalg.fill({{.*}}, %[[ALLOC]]
+ %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+
+  /// The second op has no interfering reads and can reuse the buffer.
+ // CHECK-NOT: alloc
+ // CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
+ %r = linalg.matmul ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%A: tensor<?x?xf32>)
+ -> tensor<?x?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r: tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @not_inplace
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+ /// Within op multiple uses of %A, must alloc.
+ // CHECK: alloc
+ %r = linalg.matmul ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%A: tensor<?x?xf32>)
+ -> tensor<?x?xf32>
+ return %r: tensor<?x?xf32>
+}
+// -----
+
+// CHECK-LABEL: func @vec_inplace
+func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+ -> tensor<?xf32>
+{
+ %c0 = constant 0 : index
+
+ // CHECK-NOT: alloc
+ %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @vec_not_inplace
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+
+  /// Cross-op multiple uses of %A: the first vector.transfer, whose result has interfering reads, must alloc.
+ // CHECK: %[[ALLOC:.*]] = memref.alloc
+ // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
+ %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+ /// The second vector.transfer has no interfering reads and can reuse the buffer.
+ // CHECK-NOT: alloc
+ // CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
+ %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A0 : tensor<?xf32>,
+ %A1 : tensor<?xf32> {linalg.inplaceable = true},
+ %t0 : tensor<4xf32>,
+ %t1 : tensor<4xf32> {linalg.inplaceable = true})
+ -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
+{
+  // Alloc and copy the whole result tensor, then copy the inserted slice into the subview.
+ // CHECK: %[[REALLOC_A0:.*]] = memref.alloc
+ // CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]]
+ // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
+ // CHECK: linalg.copy(%[[t0]], %[[SV_A0]])
+ %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Alloc and copy the whole result tensor, then copy the inserted slice into the subview.
+ // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
+ // CHECK: linalg.copy(%[[A0]]
+ // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
+ // CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]])
+ %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Still alloc the large tensor because %A1 is read later. Copy the inserted slice into the subview.
+ // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
+ // CHECK: linalg.copy(%[[A1]]
+ // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
+ // CHECK: linalg.copy(%[[t0]], %[[SV_A1]])
+ %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Do not realloc the large tensor. Copy the inserted slice into the subview.
+ // CHECK-NOT: alloc
+ // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
+ // CHECK: linalg.copy(%[[t1]], %[[SV_A1_2]])
+ %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // CHECK: return %[[REALLOC_A0]], %[[REALLOC_A0_2]], %[[REALLOC_A1]] :
+ // CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
+ return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+ -> tensor<?xf32>
+{
+ %f0 = constant 0.0 : f32
+
+ // CHECK-NOT: alloc
+ // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+ // CHECK: linalg.copy(%[[t]], %[[SV_A]])
+ %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ /// Overwrite A inplace.
+ // CHECK: linalg.fill({{.*}}, %[[A]]
+ %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+ -> tensor<?xf32>
+{
+ %f0 = constant 0.0 : f32
+
+ // CHECK: linalg.fill({{.*}}, %[[A]]
+ %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+ // CHECK-NOT: alloc
+ // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+ /// Overwrite A inplace by copying into the subview.
+ // CHECK: linalg.copy(%[[t]], %[[SV_A]])
+ %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // CHECK: return
+ // CHECK-NOT: tensor
+ return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
+ -> tensor<?xf32>
+{
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
+ // CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
+ // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
+ // CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
+ // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+ %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // CHECK: return %{{.*}} : memref<?xf32>
+ return %r0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ %f0 = constant 0.0 : f32
+
+  // tensor.insert_slice is bufferized first; %A is inplaceable, so we can make this inplace.
+ // CHECK-DAG: %[[SV_A:.*]] = memref.subview %[[A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
+ // CHECK-DAG: linalg.copy(%[[t]], %[[SV_A]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
+ %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // The fill would interfere with %r0, which is also being returned.
+  // So we need to bufferize it out of place into a new alloc.
+ // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+ // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
+ %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+ // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+ // CHECK: return %[[ALLOC]] : memref<?xf32>
+ return %r1, %r0: tensor<?xf32>, tensor<?xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_yield_only
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @scf_for_yield_only(%A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %lb : index, %ub : index, %step : index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
+ // CHECK: linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+ // The first scf.for remains but just turns into dead code.
+ %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+ scf.yield %t : tensor<?xf32>
+ }
+
+ // The second scf.for remains but just turns into dead code.
+ %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+ scf.yield %t : tensor<?xf32>
+ }
+
+ // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+ // CHECK: return %[[ALLOC_FOR_A]] : memref<?xf32>
+ return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @scf_for_with_tensor.insert_slice(
+ %A : tensor<?xf32>,
+ %B : tensor<?xf32> {linalg.inplaceable = true},
+ %C : tensor<4xf32>,
+ %lb : index, %ub : index, %step : index)
+ -> (tensor<?xf32>, tensor<?xf32>)
+{
+ // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
+ // CHECK: linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+ // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
+ // CHECK: %[[svB:.*]] = memref.subview %[[B]][0] [4] [1]
+
+ // CHECK: scf.for {{.*}}
+ // CHECK-NOT: iter_args
+ %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+ -> (tensor<?xf32>, tensor<?xf32>)
+ {
+    // %ttA bufferizes to a direct copy of %C into %svA.
+ // CHECK: linalg.copy(%[[C]], %[[svA]])
+ %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // %ttB bufferizes to a direct copy of %C into %svB.
+ // CHECK: linalg.copy(%[[C]], %[[svB]])
+ %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+ // CHECK-NOT: scf.yield
+ scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+ }
+
+ // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+ // CHECK: return %[[ALLOC_FOR_A]] : memref<?xf32>
+ return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
// CHECK: func private @some_external_func(memref<?xf32, #[[$DYN_1D_MAP]]>)