[Mlir-commits] [mlir] [mlir][linalg] Optional dealloc insertion for bufferize_to_allocation (PR #65610)
Martin Erhart
llvmlistbot at llvm.org
Thu Sep 7 07:24:36 PDT 2023
https://github.com/maerhart created https://github.com/llvm/llvm-project/pull/65610:
This commit makes the insertion of the memref.dealloc operation optional when transform.structured.bufferize_to_allocation is run, with omission being the new default behavior. This is desirable when the buffer-deallocation-pipeline is run after bufferization to handle buffer deallocation.
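For illustration, a minimal sketch of the intended usage, adapted from the test cases in this patch (the tensor.pad target and the surrounding sequence are placeholders):

  transform.sequence failures(propagate) {
  ^bb1(%arg1: !transform.any_op):
    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // New default: no memref.dealloc is emitted for the new allocation; the
    // buffer is expected to be freed later by the buffer deallocation
    // pipeline.
    %2, %new = transform.structured.bufferize_to_allocation %0 : !transform.any_op
  }

Adding the new emit_dealloc unit attribute restores the previous behavior, i.e. a memref.dealloc is placed at the end of the block:

  %2, %new = transform.structured.bufferize_to_allocation %0 {emit_dealloc} : !transform.any_op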
From 497802628f7cb73c5aae59d86518573d296de09a Mon Sep 17 00:00:00 2001
From: Martin Erhart <merhart at google.com>
Date: Thu, 7 Sep 2023 14:09:33 +0000
Subject: [PATCH] [mlir][linalg] Optional dealloc insertion for
bufferize_to_allocation
This commit makes the insertion of the memref.dealloc operation optional when
transform.structured.bufferize_to_allocation is run, with omission being the
new default behavior. This is desirable when the buffer-deallocation-pipeline
is run after bufferization to handle buffer deallocation.
---
.../Linalg/TransformOps/LinalgTransformOps.td | 13 +++++---
.../Dialect/Linalg/Transforms/Transforms.h | 6 ++++
.../TransformOps/LinalgTransformOps.cpp | 1 +
.../Transforms/ConvertToDestinationStyle.cpp | 8 +++--
.../Linalg/matmul-shared-memory-padding.mlir | 4 +--
.../Linalg/pad-to-specific-memory-space.mlir | 4 +--
.../transform-op-bufferize-to-allocation.mlir | 31 ++++++++++++++-----
7 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index ee6e12f72b80bab..ecfa37d35138c83 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -129,9 +129,13 @@ def BufferizeToAllocationOp : Op<Transform_Dialect,
"memref.tensor_store" when possible.
"memref.alloc" is used for new buffer allocations. The buffer is deallocated
- at the end of the block. Custom allocation ops can be specified via
- `alloc_op`. Currently supported are "memref.alloc" and "memref.alloca". In
- case of a "memref.alloca", the buffer is not deallocated.
+ at the end of the block if the "emit_dealloc" attribute is present. If this
+ attribute is not present, the allocated memory will be leaked. However,
+ running the `-buffer-deallocation-pipeline` after all bufferization is done
+ will properly insert the corresponding deallocation(s). Custom allocation
+ ops can be specified via `alloc_op`. Currently supported are "memref.alloc"
+ and "memref.alloca". In case of a "memref.alloca", the buffer is not
+ deallocated.
If `bufferize_destination_only` is set, only the destination operands of the
op are bufferized to a new memory allocation, but not the op itself.
@@ -148,7 +152,8 @@ def BufferizeToAllocationOp : Op<Transform_Dialect,
$memcpy_op,
DefaultValuedAttr<StrAttr, "\"memref.alloc\"">:
$alloc_op,
- UnitAttr:$bufferize_destination_only);
+ UnitAttr:$bufferize_destination_only,
+ UnitAttr:$emit_dealloc);
let results = (outs Transform_AnyValue:$allocated_buffer,
Transform_AnyOpType:$new_ops);
let assemblyFormat = "$target attr-dict `:` type($target)";
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index fd82c67ede5fa97..0a3597d35f0eaa9 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -57,6 +57,12 @@ struct BufferizeToAllocationOptions {
/// a new allocation (and wrapped in "bufferization.to_tensor"), but not the
/// targeted op itself.
bool bufferizeDestinationOnly = false;
+
+ /// If set to "true", a `memref.dealloc` operation will be emitted for each
+ /// allocated buffer. Otherwise, the memory is leaked, which is useful if
+ /// the buffer deallocation pipeline should be run after bufferization is
+ /// done.
+ bool emitDealloc = false;
};
/// Materialize a buffer allocation for the given tensor.pad op and lower the
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 3421a3c169dbba1..a3e264aee00f169 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -263,6 +263,7 @@ DiagnosedSilenceableFailure transform::BufferizeToAllocationOp::apply(
llvm_unreachable("invalid alloc op");
}
options.bufferizeDestinationOnly = getBufferizeDestinationOnly();
+ options.emitDealloc = getEmitDealloc();
// Bufferize ops.
Attribute memorySpace =
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
index 0c559db60ee88d2..f7340844f7e1977 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -202,9 +202,11 @@ createAllocationForTensor(RewriterBase &rewriter, Location loc, Value value,
if (options.allocOp ==
linalg::BufferizeToAllocationOptions::AllocOp::MemrefAlloc) {
alloc = rewriter.create<memref::AllocOp>(loc, memrefType, dynamicSizes);
- // Place deallocation at the end of the block.
- rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator());
- rewriter.create<memref::DeallocOp>(loc, alloc);
+ if (options.emitDealloc) {
+ // Place deallocation at the end of the block.
+ rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator());
+ rewriter.create<memref::DeallocOp>(loc, alloc);
+ }
} else if (options.allocOp ==
linalg::BufferizeToAllocationOptions::AllocOp::MemrefAlloca) {
alloc = rewriter.create<memref::AllocaOp>(loc, memrefType, dynamicSizes);
diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
index a809939e7539eef..da6ebdbd24ded48 100644
--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
+++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
@@ -85,7 +85,7 @@ transform.sequence failures(propagate) {
// Assign shared memory buffer to padding.
%buffer, %new_ops = transform.structured.bufferize_to_allocation
- %pad_forall_op {memory_space = 3, bufferize_destination_only}
+ %pad_forall_op {memory_space = 3, bufferize_destination_only, emit_dealloc}
: !transform.any_op
// Bufferize.
@@ -197,7 +197,7 @@ transform.sequence failures(propagate) {
// Assign shared memory buffer to padding.
%buffer, %new_ops = transform.structured.bufferize_to_allocation
- %pad_forall_op {memory_space = 3, bufferize_destination_only}
+ %pad_forall_op {memory_space = 3, bufferize_destination_only, emit_dealloc}
: !transform.any_op
// Bufferize.
diff --git a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir
index 5b20627e19dc209..45c2eb5dfdf5022 100644
--- a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir
+++ b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir
@@ -54,7 +54,7 @@ transform.sequence failures(propagate) {
padding_dimensions=[0, 1, 2],
pack_paddings=[1, 1, 1]
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
- %buffer, %new_ops = transform.structured.bufferize_to_allocation %pad {memory_space = 3} : !transform.any_op
+ %buffer, %new_ops = transform.structured.bufferize_to_allocation %pad {memory_space = 3, emit_dealloc} : !transform.any_op
%2 = transform.bufferization.one_shot_bufferize %arg1 {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op
}
@@ -114,6 +114,6 @@ transform.sequence failures(propagate) {
transform.structured.masked_vectorize %pad vector_sizes [10, 12] : !transform.any_op
%vector_write = transform.structured.match ops{["vector.transfer_write"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%mask_op = transform.get_parent_op %vector_write {op_name = "vector.mask"} : (!transform.any_op) -> !transform.any_op
- %buffer, %new_ops = transform.structured.bufferize_to_allocation %mask_op {memory_space = 3} : !transform.any_op
+ %buffer, %new_ops = transform.structured.bufferize_to_allocation %mask_op {memory_space = 3, emit_dealloc} : !transform.any_op
%2 = transform.bufferization.one_shot_bufferize %arg1 {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op
}
diff --git a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
index ddb953237b008d8..7dee14f22df5d08 100644
--- a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
@@ -32,7 +32,7 @@ func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {emit_dealloc} : !transform.any_op
// Ensure that one linalg.fill was generated.
%fill_op = transform.select "linalg.fill" in %new : (!transform.any_op) -> !transform.any_op
@@ -67,7 +67,7 @@ func.func @tensor_pad_constant_with_custom_copy(
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 3, alloc_op = "memref.alloca", memcpy_op = "linalg.copy"}: !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 3, alloc_op = "memref.alloca", memcpy_op = "linalg.copy", emit_dealloc}: !transform.any_op
// Ensure that one linalg.fill was generated.
%fill_op = transform.select "linalg.fill" in %new : (!transform.any_op) -> !transform.any_op
@@ -110,7 +110,7 @@ func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {emit_dealloc} : !transform.any_op
// Make sure that One-Shot Bufferize can bufferize the rest.
%4 = transform.bufferization.one_shot_bufferize %arg1 : (!transform.any_op) -> !transform.any_op
}
@@ -134,7 +134,7 @@ func.func @tensor_insert(%t: tensor<?x10xindex>, %idx: index, %v: index) -> tens
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.insert"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4} : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, emit_dealloc} : !transform.any_op
// Make sure that One-Shot Bufferize can bufferize the rest.
%4 = transform.bufferization.one_shot_bufferize %arg1 : (!transform.any_op) -> !transform.any_op
}
@@ -157,7 +157,7 @@ func.func @tensor_insert_into_empty(%idx: index, %v: index) -> tensor<10xindex>
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.insert"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4} : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, emit_dealloc} : !transform.any_op
// Make sure that One-Shot Bufferize can bufferize the rest.
%4 = transform.bufferization.one_shot_bufferize %arg1 : (!transform.any_op) -> !transform.any_op
}
@@ -174,7 +174,7 @@ transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.extract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
// expected-error @below{{failed to bufferize operation}}
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4} : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, emit_dealloc} : !transform.any_op
}
// -----
@@ -195,7 +195,7 @@ func.func @vector_mask(%t: tensor<?xf32>, %val: vector<16xf32>, %idx: index, %m0
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["vector.mask"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4} : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, emit_dealloc} : !transform.any_op
}
// -----
@@ -216,7 +216,7 @@ func.func @tensor_insert_destination(%t: tensor<?x10xindex>, %idx: index, %v: in
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["tensor.insert"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, bufferize_destination_only} : !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, bufferize_destination_only, emit_dealloc} : !transform.any_op
}
// -----
@@ -240,5 +240,20 @@ func.func @scf_for_destination(%t: tensor<?x10xindex>, %lb: index, %ub: index, %
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
%0 = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, bufferize_destination_only, emit_dealloc} : !transform.any_op
+}
+
+// -----
+
+// CHECK-LABEL: func @tensor_insert_destination_no_dealloc
+// CHECK-NOT: dealloc
+func.func @tensor_insert_destination_no_dealloc(%t: tensor<?x10xindex>, %idx: index, %v: index) -> tensor<?x10xindex> {
+ %r = tensor.insert %v into %t[%idx, %idx] : tensor<?x10xindex>
+ return %r : tensor<?x10xindex>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+ %0 = transform.structured.match ops{["tensor.insert"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%2, %new = transform.structured.bufferize_to_allocation %0 {memory_space = 4, bufferize_destination_only} : !transform.any_op
}
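For reference, a sketch of the intended follow-up (the input file name is a placeholder; the pipeline is the one referenced in the op documentation above), run after all bufferization is done to insert the missing deallocations:

  mlir-opt input.mlir --buffer-deallocation-pipeline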