[Mlir-commits] [mlir] Avoid buffer hoisting from parallel loops (PR #90735)

Wed May 1 07:47:21 PDT 2024

llvmbot wrote:



@llvm/pr-subscribers-mlir-scf

@llvm/pr-subscribers-mlir

Author: Rafael Ubal (rafaelubalmw)

<details>
<summary>Changes</summary>

This change corrects an invalid behavior in pass `--buffer-loop-hoisting`. The pass is in charge of extracting buffer allocations (e.g., `memref.alloca`) from loop regions (e.g., `scf.for`) when possible. This works OK for loops with sequential execution semantics. However, a buffer allocated in the body of a parallel loop may be concurrently accessed by multiple threads, each using it to store its own local data. Extracting such a buffer from the loop causes all threads to wrongly share the same memory region.

In the following example, dimension 1 of the input memref is reversed. Dimension 0 is traversed with a parallel loop.

```
func.func @f(%input: memref<2x3xf32>) -> memref<2x3xf32> {
  %c0 = index.constant 0
  %c1 = index.constant 1
  %c2 = index.constant 2
  %c3 = index.constant 3

  %output = memref.alloc() : memref<2x3xf32>
  scf.parallel (%index) = (%c0) to (%c2) step (%c1) {
    // Create subviews for working input and output slices
    %input_slice = memref.subview %input[%index, 2][1, 3][1, -1] : memref<2x3xf32> to memref<1x3xf32, strided<[3, -1], offset: ?>>
    %output_slice = memref.subview %output[%index, 0][1, 3][1, 1] : memref<2x3xf32> to memref<1x3xf32, strided<[3, 1], offset: ?>>

    // Copy the input slice into this temporary buffer. This intermediate
    // copy is unnecessary, but is used for illustration purposes.
    %temp = memref.alloc() : memref<1x3xf32>
    memref.copy %input_slice, %temp : memref<1x3xf32, strided<[3, -1], offset: ?>> to memref<1x3xf32>

    // Copy temporary buffer into output slice
    memref.copy %temp, %output_slice : memref<1x3xf32> to memref<1x3xf32, strided<[3, 1], offset: ?>>
    scf.reduce
  }

  return %output : memref<2x3xf32>
}
```

The patch submitted here prevents `%temp = memref.alloc() : memref<1x3xf32>` from being hoisted when the containing op is `scf.parallel` or `scf.forall`. A new op trait called `HasParallelRegion` is introduced and assigned to these two ops to indicate that their regions have parallel execution semantics.

@joker-eph @ftynse @nicolasvasilache @sabauma 

---
Full diff: https://github.com/llvm/llvm-project/pull/90735.diff


5 Files Affected:

- (modified) mlir/include/mlir/Dialect/SCF/IR/SCFOps.td (+4-2) 
- (modified) mlir/include/mlir/IR/OpBase.td (+2) 
- (modified) mlir/include/mlir/IR/OpDefinition.h (+12) 
- (modified) mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp (+12-5) 
- (modified) mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir (+32) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index b3d085bfff1af9..41a0b67c42a0bf 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -307,7 +307,8 @@ def ForallOp : SCF_Op<"forall", [
        RecursiveMemoryEffects,
        SingleBlockImplicitTerminator<"scf::InParallelOp">,
        DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-       DestinationStyleOpInterface
+       DestinationStyleOpInterface,
+       HasParallelRegion
      ]> {
   let summary = "evaluate a block multiple times in parallel";
   let description = [{
@@ -764,7 +765,8 @@ def ParallelOp : SCF_Op<"parallel",
           "getSingleLowerBound", "getSingleUpperBound", "getSingleStep"]>,
      RecursiveMemoryEffects,
      DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-     SingleBlockImplicitTerminator<"scf::ReduceOp">]> {
+     SingleBlockImplicitTerminator<"scf::ReduceOp">,
+     HasParallelRegion]> {
   let summary = "parallel for operation";
   let description = [{
     The "scf.parallel" operation represents a loop nest taking 4 groups of SSA
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 7866ac24c1ccba..6cd0d38a906a45 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -79,6 +79,8 @@ def Idempotent : NativeOpTrait<"IsIdempotent">;
 def Involution : NativeOpTrait<"IsInvolution">;
 // Op behaves like a constant.
 def ConstantLike : NativeOpTrait<"ConstantLike">;
+// Op contains a region with parallel execution semantics
+def HasParallelRegion : NativeOpTrait<"HasParallelRegion">;
 // Op is isolated from above.
 def IsolatedFromAbove : NativeOpTrait<"IsIsolatedFromAbove">;
 // Op results are float or vectors/tensors thereof.
diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
index 59f094d6690991..38b286c01b57c8 100644
--- a/mlir/include/mlir/IR/OpDefinition.h
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -1242,6 +1242,18 @@ class ConstantLike : public TraitBase<ConcreteType, ConstantLike> {
   }
 };
 
+// A trait indicating that the single region contained in the operation has
+// parallel execution semantics. This may have implications in a certain pass.
+// For example, buffer hoisting is illegal in parallel loops, and local buffers
+// may be accessed by parallel threads simultaneously.
+template <typename ConcreteType>
+class HasParallelRegion : public TraitBase<ConcreteType, HasParallelRegion> {
+public:
+  static LogicalResult verifyTrait(Operation *op) {
+    return impl::verifyOneRegion(op);
+  }
+};
+
 /// This class provides the API for ops that are known to be isolated from
 /// above.
 template <typename ConcreteType>
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
index 9dc2f262a51161..d7056f35cbc8d5 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
@@ -59,6 +59,12 @@ static bool isLoop(Operation *op) {
   return regionInterface.hasLoop();
 }
 
+/// Return whether the given operation is a loop with sequential execution
+/// semantics.
+static bool isSequentialLoop(Operation *op) {
+  return !op->hasTrait<OpTrait::HasParallelRegion>() && isLoop(op);
+}
+
 /// Returns true if the given operation implements the AllocationOpInterface
 /// and it supports the dominate block hoisting.
 static bool allowAllocDominateBlockHoisting(Operation *op) {
@@ -338,12 +344,13 @@ struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
     return dependencyBlock ? dependencyBlock : nullptr;
   }
 
-  /// Returns true if the given operation represents a loop and one of the
-  /// aliases caused the `aliasDominatorBlock` to be "above" the block of the
-  /// given loop operation. If this is the case, it indicates that the
-  /// allocation is passed via a back edge.
+  /// Returns true if the given operation represents a loop with sequential
+  /// execution semantics and one of the aliases caused the
+  /// `aliasDominatorBlock` to be "above" the block of the given loop operation.
+  /// If this is the case, it indicates that the allocation is passed via a back
+  /// edge.
   bool isLegalPlacement(Operation *op) {
-    return isLoop(op) &&
+    return isSequentialLoop(op) &&
            !dominators->dominates(aliasDominatorBlock, op->getBlock());
   }
 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
index 5f8bc18b64da1e..ee7571b2c84669 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
@@ -461,6 +461,38 @@ func.func @partial_hoist_multiple_loop_dependency(
 
 // -----
 
+// CHECK-LABEL: func @no_hoist_parallel
+func.func @no_hoist_parallel(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.parallel (%i) = (%lb) to (%ub) step (%step) {
+      %0 = memref.alloc() : memref<2xf32>
+      scf.reduce
+  }
+  return
+}
+
+//      CHECK: memref.alloc
+// CHECK-NEXT: scf.reduce
+
+// -----
+
+func.func @no_hoist_forall(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.forall (%i) = (%lb) to (%ub) step (%step) {
+      %1 = memref.alloc() : memref<2xf32>
+  }
+  return
+}
+
+//      CHECK: scf.forall
+// CHECK-NEXT: memref.alloc
+
+// -----
+
 // Test with allocas to ensure that op is also considered.
 
 // CHECK-LABEL: func @hoist_alloca

``````````

</details>


https://github.com/llvm/llvm-project/pull/90735