[Mlir-commits] [mlir] a42a2ca - Avoid buffer hoisting from parallel loops (#90735)

Fri May 3 23:35:41 PDT 2024

Author: Rafael Ubal
Date: 2024-05-04T08:35:36+02:00
New Revision: a42a2ca19b2325fa6844d6b10e88eb53a3f2fde8

URL: https://github.com/llvm/llvm-project/commit/a42a2ca19b2325fa6844d6b10e88eb53a3f2fde8
DIFF: https://github.com/llvm/llvm-project/commit/a42a2ca19b2325fa6844d6b10e88eb53a3f2fde8.diff

LOG: Avoid buffer hoisting from parallel loops (#90735)

This change corrects an invalid behavior in pass
`--buffer-loop-hoisting`. The pass is in charge of extracting buffer
allocations (e.g., `memref.alloca`) from loop regions (e.g., `scf.for`)
when possible. This works OK for loops with sequential execution
semantics. However, a buffer allocated in the body of a parallel loop
may be concurrently accessed by multiple threads to store their local data.
Extracting such buffer from the loop causes all threads to wrongly share
the same memory region.

In the following example, dimension 1 of the input tensor is reversed.
Dimension 0 is traversed with a parallel loop.

```
func.func @f(%input: memref<2x3xf32>) -> memref<2x3xf32> {
  %c0 = index.constant 0
  %c1 = index.constant 1
  %c2 = index.constant 2
  %c3 = index.constant 3

  %output = memref.alloc() : memref<2x3xf32>
  scf.parallel (%index) = (%c0) to (%c2) step (%c1) {
    // Create subviews for working input and output slices
    %input_slice = memref.subview %input[%index, 2][1, 3][1, -1] : memref<2x3xf32> to memref<1x3xf32, strided<[3, -1], offset: ?>>
    %output_slice = memref.subview %output[%index, 0][1, 3][1, 1] : memref<2x3xf32> to memref<1x3xf32, strided<[3, 1], offset: ?>>

    // Copy the input slice into this temporary buffer. This intermediate
    // copy is unnecessary, but is used for illustration purposes.
    %temp = memref.alloc() : memref<1x3xf32>
    memref.copy %input_slice, %temp : memref<1x3xf32, strided<[3, -1], offset: ?>> to memref<1x3xf32>

    // Copy temporary buffer into output slice
    memref.copy %temp, %output_slice : memref<1x3xf32> to memref<1x3xf32, strided<[3, 1], offset: ?>>
    scf.reduce
  }

  return %output : memref<2x3xf32>
}
```

The patch submitted here prevents `%temp = memref.alloc() :
memref<1x3xf32>` from being hoisted when the containing op is
`scf.parallel` or `scf.forall`. A new op trait called
`HasParallelRegion` is introduced and assigned to these two ops to
indicate that their regions have parallel execution semantics.

@joker-eph @ftynse @nicolasvasilache @sabauma

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
    mlir/include/mlir/Interfaces/LoopLikeInterface.h
    mlir/include/mlir/Interfaces/LoopLikeInterface.td
    mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
    mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index b3d085bfff1af9..41a0b67c42a0bf 100644

--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -307,7 +307,8 @@ def ForallOp : SCF_Op<"forall", [
        RecursiveMemoryEffects,
        SingleBlockImplicitTerminator<"scf::InParallelOp">,
        DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-       DestinationStyleOpInterface
+       DestinationStyleOpInterface,
+       HasParallelRegion
      ]> {
   let summary = "evaluate a block multiple times in parallel";
   let description = [{
@@ -764,7 +765,8 @@ def ParallelOp : SCF_Op<"parallel",
           "getSingleLowerBound", "getSingleUpperBound", "getSingleStep"]>,
      RecursiveMemoryEffects,
      DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-     SingleBlockImplicitTerminator<"scf::ReduceOp">]> {
+     SingleBlockImplicitTerminator<"scf::ReduceOp">,
+     HasParallelRegion]> {
   let summary = "parallel for operation";
   let description = [{
     The "scf.parallel" operation represents a loop nest taking 4 groups of SSA

diff  --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
index 7c7d378d0590ab..42609e824c86ac 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -29,8 +29,31 @@ namespace detail {
 /// Verify invariants of the LoopLikeOpInterface.
 LogicalResult verifyLoopLikeOpInterface(Operation *op);
 } // namespace detail
+
+//===----------------------------------------------------------------------===//
+// Traits
+//===----------------------------------------------------------------------===//
+
+namespace OpTrait {
+// A trait indicating that the single region contained in the operation has
+// parallel execution semantics. This may have implications in a certain pass.
+// For example, buffer hoisting is illegal in parallel loops, and local buffers
+// may be accessed by parallel threads simultaneously.
+template <typename ConcreteType>
+class HasParallelRegion : public TraitBase<ConcreteType, HasParallelRegion> {
+public:
+  static LogicalResult verifyTrait(Operation *op) {
+    return impl::verifyOneRegion(op);
+  }
+};
+
+} // namespace OpTrait
 } // namespace mlir
 
+//===----------------------------------------------------------------------===//
+// Interfaces
+//===----------------------------------------------------------------------===//
+
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/LoopLikeInterface.h.inc"
 

diff  --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.td b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
index e2ac85a3f7725d..f0dc6e60eba584 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
@@ -15,6 +15,10 @@
 
 include "mlir/IR/OpBase.td"
 
+//===----------------------------------------------------------------------===//
+// Interfaces
+//===----------------------------------------------------------------------===//
+
 def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
   let description = [{
     Contains helper functions to query properties and perform transformations
@@ -371,4 +375,11 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Traits
+//===----------------------------------------------------------------------===//
+
+// Op contains a region with parallel execution semantics
+def HasParallelRegion : NativeOpTrait<"HasParallelRegion">;
+
 #endif // MLIR_INTERFACES_LOOPLIKEINTERFACE

diff  --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
index 9dc2f262a51161..d7056f35cbc8d5 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
@@ -59,6 +59,12 @@ static bool isLoop(Operation *op) {
   return regionInterface.hasLoop();
 }
 
+/// Return whether the given operation is a loop with sequential execution
+/// semantics.
+static bool isSequentialLoop(Operation *op) {
+  return !op->hasTrait<OpTrait::HasParallelRegion>() && isLoop(op);
+}
+
 /// Returns true if the given operation implements the AllocationOpInterface
 /// and it supports the dominate block hoisting.
 static bool allowAllocDominateBlockHoisting(Operation *op) {
@@ -338,12 +344,13 @@ struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
     return dependencyBlock ? dependencyBlock : nullptr;
   }
 
-  /// Returns true if the given operation represents a loop and one of the
-  /// aliases caused the `aliasDominatorBlock` to be "above" the block of the
-  /// given loop operation. If this is the case, it indicates that the
-  /// allocation is passed via a back edge.
+  /// Returns true if the given operation represents a loop with sequential
+  /// execution semantics and one of the aliases caused the
+  /// `aliasDominatorBlock` to be "above" the block of the given loop operation.
+  /// If this is the case, it indicates that the allocation is passed via a back
+  /// edge.
   bool isLegalPlacement(Operation *op) {
-    return isLoop(op) &&
+    return isSequentialLoop(op) &&
            !dominators->dominates(aliasDominatorBlock, op->getBlock());
   }
 

diff  --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
index 5f8bc18b64da1e..ee7571b2c84669 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
@@ -461,6 +461,38 @@ func.func @partial_hoist_multiple_loop_dependency(
 
 // -----
 
+// CHECK-LABEL: func @no_hoist_parallel
+func.func @no_hoist_parallel(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.parallel (%i) = (%lb) to (%ub) step (%step) {
+      %0 = memref.alloc() : memref<2xf32>
+      scf.reduce
+  }
+  return
+}
+
+//      CHECK: memref.alloc
+// CHECK-NEXT: scf.reduce
+
+// -----
+
+func.func @no_hoist_forall(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.forall (%i) = (%lb) to (%ub) step (%step) {
+      %1 = memref.alloc() : memref<2xf32>
+  }
+  return
+}
+
+//      CHECK: scf.forall
+// CHECK-NEXT: memref.alloc
+
+// -----
+
 // Test with allocas to ensure that op is also considered.
 
 // CHECK-LABEL: func @hoist_alloca