[Mlir-commits] [mlir] 1e1a311 - [mlir][bufferization] Privatize buffers for parallel regions
Matthias Springer
llvmlistbot at llvm.org
Wed Sep 6 05:37:30 PDT 2023
Author: Matthias Springer
Date: 2023-09-06T14:28:43+02:00
New Revision: 1e1a3112f123de010c58af08ad87ae1dee20ff05
URL: https://github.com/llvm/llvm-project/commit/1e1a3112f123de010c58af08ad87ae1dee20ff05
DIFF: https://github.com/llvm/llvm-project/commit/1e1a3112f123de010c58af08ad87ae1dee20ff05.diff
LOG: [mlir][bufferization] Privatize buffers for parallel regions
One-Shot Bufferize correctly handles RaW conflicts around repetitive regions (loops). Special handling is needed for parallel regions: a special kind of repetitive region that can exhibit additional RaW conflicts that would not be present if the region's iterations were executed sequentially.
Example:
```
%0 = bufferization.alloc_tensor()
scf.forall ... {
  %1 = linalg.fill ins(...) outs(%0)
  ...
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %1 into ...
  }
}
```
A separate (private) buffer must be allocated for each iteration of the `scf.forall` loop.
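After bufferization, privatization materializes as a per-iteration allocation inside the loop body. The following is a minimal sketch of the resulting memref-level IR (illustrative only; types are elided, and the exact placement of allocations and copies is up to the bufferization implementation):
```
%buf = memref.alloc() ...
scf.forall ... {
  // Each iteration fills its own private allocation instead of writing to
  // one buffer that is shared across all iterations.
  %private = memref.alloc() ...
  linalg.fill ins(...) outs(%private)
  ...
}
```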
This change adds a new interface method to `BufferizableOpInterface` to detect parallel regions. By default, regions are assumed to be sequential.
A buffer is privatized if an OpOperand bufferizes to a memory read inside a parallel region that is different from the parallel region in which the operand's value is defined.
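Conversely, no privatization is needed when the value is defined inside the same parallel region that reads it, because each iteration then already operates on its own buffer. An illustrative sketch:
```
scf.forall ... {
  // %local is defined in the same parallel region that reads it, so the
  // fill may bufferize in place; no private copy is required.
  %local = bufferization.alloc_tensor()
  %2 = linalg.fill ins(...) outs(%local)
  ...
}
```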
Differential Revision: https://reviews.llvm.org/D159286
Added:
Modified:
mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index b61994e8b9feea..9ec44dfd16a0c0 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -684,6 +684,14 @@ Operation *getOwnerOfValue(Value value);
Region *getNextEnclosingRepetitiveRegion(Region *region,
const BufferizationOptions &options);
+/// If `region` is a parallel region, return `region`. Otherwise, find the first
+/// enclosing parallel region of `region`. If there is no such region, return
+/// "nullptr".
+///
+/// Note: Whether a region is parallel or sequential is queried from the
+/// `BufferizableOpInterface`.
+Region *getParallelRegion(Region *region, const BufferizationOptions &options);
+
namespace detail {
/// This is the default implementation of
/// BufferizableOpInterface::getAliasingOpOperands. Should not be called from
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
index 7433853717f24f..fd1ceb68af5dd9 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
@@ -556,6 +556,25 @@ def BufferizableOpInterface : OpInterface<"BufferizableOpInterface"> {
::llvm::cast<BufferizableOpInterface>($_op.getOperation()), index);
}]
>,
+ InterfaceMethod<
+ /*desc=*/[{
+ Return `true` if the given region of this op is parallel, i.e.,
+ multiple instances of the region may be executing at the same time.
+ If a region is parallel, it must also be marked as "repetitive".
+
+ The RaW conflict detection of One-Shot Analysis is stricter inside
+ parallel regions: Buffers may have to be privatized.
+
+ By default, regions are assumed to be sequential.
+ }],
+ /*retType=*/"bool",
+ /*methodName=*/"isParallelRegion",
+ /*args=*/(ins "unsigned":$index),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return false;
+ }]
+ >,
StaticInterfaceMethod<
/*desc=*/[{
Return `true` if the op and this interface implementation supports
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 34a1625b0daa7b..2e549b0335688c 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -119,6 +119,21 @@ Region *bufferization::getNextEnclosingRepetitiveRegion(
return region;
}
+Region *bufferization::getParallelRegion(Region *region,
+ const BufferizationOptions &options) {
+ while (region) {
+ auto bufferizableOp = options.dynCastBufferizableOp(region->getParentOp());
+ if (bufferizableOp &&
+ bufferizableOp.isParallelRegion(region->getRegionNumber())) {
+ assert(isRepetitiveRegion(region, options) &&
+ "expected that all parallel regions are also repetitive regions");
+ return region;
+ }
+ region = region->getParentRegion();
+ }
+ return nullptr;
+}
+
Operation *bufferization::getOwnerOfValue(Value value) {
if (auto opResult = llvm::dyn_cast<OpResult>(value))
return opResult.getDefiningOp();
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 49b5ebdf722a1a..9dd4be9a1a08eb 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -545,6 +545,43 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
OneShotAnalysisState &state) {
const BufferizationOptions &options = state.getOptions();
+ // Before going through the main RaW analysis, find cases where a buffer must
+ // be privatized due to parallelism. If the result of a write is never read,
+ // privatization is not necessary (and large parts of the IR are likely dead).
+ if (!usesRead.empty()) {
+ for (OpOperand *uConflictingWrite : usesWrite) {
+ // Find the allocation point or last write (definition) of the buffer.
+ // Note: In contrast to `findDefinitions`, this also returns results of
+ // ops that do not bufferize to a memory write when no other definition
+ // could be found. E.g., "bufferization.alloc_tensor" would be included,
+ // even though that op just bufferizes to an allocation and does not
+ // define the contents of the buffer.
+ SetVector<Value> definitionsOrLeaves =
+ state.findValueInReverseUseDefChain(
+ uConflictingWrite->get(),
+ [&](Value v) { return state.bufferizesToMemoryWrite(v); });
+ assert(!definitionsOrLeaves.empty() &&
+ "expected at least one definition or leaf");
+
+ // The writing op must bufferize out-of-place if the definition is in a
+ // different parallel region than this write.
+ for (Value def : definitionsOrLeaves) {
+ if (getParallelRegion(def.getParentRegion(), options) !=
+ getParallelRegion(uConflictingWrite->getOwner()->getParentRegion(),
+ options)) {
+ LLVM_DEBUG(
+ llvm::dbgs()
+ << "\n- bufferizes out-of-place due to parallel region:\n");
+ LLVM_DEBUG(llvm::dbgs()
+ << " unConflictingWrite = operand "
+ << uConflictingWrite->getOperandNumber() << " of "
+ << *uConflictingWrite->getOwner() << "\n");
+ return true;
+ }
+ }
+ }
+ }
+
for (OpOperand *uRead : usesRead) {
Operation *readingOp = uRead->getOwner();
LLVM_DEBUG(llvm::dbgs() << "\n- check conflict:\n");
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 1a604c00e4321f..11cfefed890c66 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1202,6 +1202,10 @@ struct ForallOpInterface
}
return false;
}
+
+ bool isParallelRegion(Operation *op, unsigned index) const {
+ return isRepetitiveRegion(op, index);
+ }
};
/// Nothing to do for InParallelOp.
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
index d78472c8bf81b4..1b9143bde6821a 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -798,3 +798,106 @@ func.func @nesting_op_repetitive_regions(
}
return
}
+
+// -----
+
+// CHECK-LABEL: func @parallel_region()
+func.func @parallel_region() -> tensor<320xf32>
+{
+ %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
+ %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
+ %c320 = arith.constant 320 : index
+ // CHECK: scf.forall
+ %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
+ %val = "test.foo"() : () -> (f32)
+ // linalg.fill must bufferize out-of-place because every thread needs a
+ // private copy of %alloc1.
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+ %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
+ scf.forall.in_parallel {
+ // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+ tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+ }
+ }
+ // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
+ return %0 : tensor<320xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_region_mixed_def(
+func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32>
+{
+ %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
+ %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
+ %c320 = arith.constant 320 : index
+ // CHECK: scf.forall
+ %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
+ %alloc2 = bufferization.alloc_tensor() : tensor<1xf32>
+ %selected = scf.if %c -> tensor<1xf32> {
+ scf.yield %alloc1 : tensor<1xf32>
+ } else {
+ scf.yield %alloc2 : tensor<1xf32>
+ }
+ %val = "test.foo"() : () -> (f32)
+ // linalg.fill must bufferize out-of-place because every thread needs a
+ // private copy of %alloc1.
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+ %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32>
+ scf.forall.in_parallel {
+ // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+ tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+ }
+ }
+ // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
+ return %0 : tensor<320xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_region_two_writes(
+func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32>
+{
+ %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
+ %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
+ %c320 = arith.constant 320 : index
+ %c0 = arith.constant 0 : index
+ // CHECK: scf.forall
+ %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
+ %val = "test.foo"() : () -> (f32)
+ // linalg.fill must bufferize out-of-place because every thread needs a
+ // private copy of %alloc1.
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+ %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
+ // CHECK: tensor.insert
+ // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
+ %inserted = tensor.insert %f into %fill[%c0] : tensor<1xf32>
+
+ scf.forall.in_parallel {
+ // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+ tensor.parallel_insert_slice %inserted into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+ }
+ }
+ // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
+ return %0 : tensor<320xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_region_no_read()
+func.func @parallel_region_no_read()
+{
+ %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
+ %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
+ %c320 = arith.constant 320 : index
+ // CHECK: scf.forall
+ scf.forall (%arg0) in (%c320) {
+ %val = "test.foo"() : () -> (f32)
+ // linalg.fill can bufferize in-place because no alias of %alloc1 is read.
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+ %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
+ scf.forall.in_parallel {
+ }
+ }
+ return
+}