[Mlir-commits] [mlir] [mlir][bufferization] Adding the optimize-allocation-liveness pass (PR #101827)

Tue Aug 13 23:50:15 PDT 2024

https://github.com/DennisFily updated https://github.com/llvm/llvm-project/pull/101827

>From ddd1d367f3e8ceff9909717b02a579bdd8e335c5 Mon Sep 17 00:00:00 2001
From: "Dennis.Filimonov" <dennis.filimonov at gmail.com>
Date: Fri, 2 Aug 2024 23:31:02 +0300
Subject: [PATCH] Adding the optimize-allocation-liveness pass

This commit will add a pass that is expected to run after the
deallocation pipeline and will move buffer deallocations right after their
last user, thus optimizing the allocation liveness.
---
 .../Dialect/Bufferization/Transforms/Passes.h |   6 +
 .../Bufferization/Transforms/Passes.td        |  17 ++
 .../Bufferization/Transforms/CMakeLists.txt   |   1 +
 .../Transforms/OptimizeAllocationLiveness.cpp | 159 ++++++++++++++++
 .../optimize-allocation-liveness.mlir         | 178 ++++++++++++++++++
 5 files changed, 361 insertions(+)
 create mode 100644 mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
 create mode 100644 mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir

diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index c12ed7f5d0180b..18896fc75ff58a 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -37,6 +37,12 @@ std::unique_ptr<Pass> createBufferDeallocationPass();
 std::unique_ptr<Pass> createOwnershipBasedBufferDeallocationPass(
     DeallocationOptions options = DeallocationOptions());
 
+/// Creates a pass that finds all temp allocations,
+/// and attempts to move the deallocation after the last user/dependency 
+/// of the allocation. Thus, optimizing the allocation liveness.
+/// The pass is expected to run after the deallocaion pipeline.
+std::unique_ptr<Pass> createOptimizeAllocationLivenessPass();
+
 /// Creates a pass that optimizes `bufferization.dealloc` operations. For
 /// example, it reduces the number of alias checks needed at runtime using
 /// static alias analysis.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 1cece818dbbbc3..cccea0f7e4ffdb 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -252,6 +252,23 @@ def BufferDeallocationSimplification :
   ];
 }
 
+def OptimizeAllocationLiveness
+    : Pass<"optimize-allocation-liveness", "func::FuncOp"> {
+  let summary = "This pass optimizes the liveness of temp allocations in the "
+                "input function";
+  let description =
+      [{This pass will go over all the allocations that also have deallocations
+                in the same block i.e.temp buffers.And finds the last user /
+            dependency of each allocation,
+        it attempts to move the deallocation right after that last
+            user.This will optimize liveness of the allocations to the minimum
+                .The pass is expected to run after the deallocating pipeline,
+        which places all deallocation at the end of the function.}];
+  let constructor =
+      "mlir::bufferization::createOptimizeAllocationLivenessPass()";
+  let dependentDialects = ["mlir::memref::MemRefDialect"];
+}
+
 def LowerDeallocations : Pass<"bufferization-lower-deallocations"> {
   let summary = "Lowers `bufferization.dealloc` operations to `memref.dealloc`"
                 "operations";
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt
index 8617c17e7a5e5e..f27d924416677a 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_dialect_library(MLIRBufferizationTransforms
   OneShotModuleBufferize.cpp
   OwnershipBasedBufferDeallocation.cpp
   TensorCopyInsertion.cpp
+  OptimizeAllocationLiveness.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Bufferization
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
new file mode 100644
index 00000000000000..a420df7b7ca657
--- /dev/null
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
@@ -0,0 +1,159 @@
+//===- OptimizeAllocationLiveness.cpp - impl. optimize location of temp buffer
+// deallocations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an algorithem for optimization of allocation liveness,
+// The algorithm moves the dealloc op to right after the last user of the
+// allocation and on the same block as the allocation.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "optimize-allocation-liveness"
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+namespace mlir {
+namespace bufferization {
+#define GEN_PASS_DEF_OPTIMIZEALLOCATIONLIVENESS
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h.inc"
+} // namespace bufferization
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// Return true if `a` happens before `b`, i.e., `a` or one of its ancestors
+/// properly dominates `b` and `b` is not inside `a`.
+static bool happensBefore(Operation *a, Operation *b) {
+  do {
+    if (a->isProperAncestor(b))
+      return false;
+    if (Operation *bAncestor = a->getBlock()->findAncestorOpInBlock(*b)) {
+      return a->isBeforeInBlock(bAncestor);
+    }
+  } while ((a = a->getParentOp()));
+  return false;
+}
+
+/// This method searches for a user of value that is a dealloc operation.
+/// If multiple users with free effect are found, return nullptr.
+Operation *findUserWithFreeSideEffect(Value value) {
+  Operation *freeOpUser = nullptr;
+  for (Operation *user : value.getUsers()) {
+    if (MemoryEffectOpInterface memEffectOp =
+            dyn_cast<MemoryEffectOpInterface>(user)) {
+      SmallVector<MemoryEffects::EffectInstance, 2> effects;
+      memEffectOp.getEffects(effects);
+
+      for (const auto &effect : effects) {
+        if (isa<MemoryEffects::Free>(effect.getEffect())) {
+          if (freeOpUser) {
+            LDBG("Multiple users with free effect found: " << *freeOpUser
+                                                           << " and " << *user);
+            return nullptr;
+          }
+          freeOpUser = user;
+        }
+      }
+    }
+  }
+  return freeOpUser;
+}
+
+/// Checks if the given op allocates memory.
+static bool hasMemoryAllocEffect(MemoryEffectOpInterface memEffectOp) {
+  SmallVector<MemoryEffects::EffectInstance, 2> effects;
+  memEffectOp.getEffects(effects);
+  for (const auto &effect : effects) {
+    if (isa<MemoryEffects::Allocate>(effect.getEffect())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+struct OptimizeAllocationLiveness
+    : public bufferization::impl::OptimizeAllocationLivenessBase<
+          OptimizeAllocationLiveness> {
+public:
+  OptimizeAllocationLiveness() = default;
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    if (func.isExternal())
+      return;
+
+    BufferViewFlowAnalysis analysis = BufferViewFlowAnalysis(func);
+
+    func.walk([&](MemoryEffectOpInterface memEffectOp) -> WalkResult {
+      if (!hasMemoryAllocEffect(memEffectOp))
+        return WalkResult::advance();
+
+      auto allocOp = memEffectOp;
+      LDBG("Checking alloc op: " << allocOp);
+
+      auto deallocOp = findUserWithFreeSideEffect(allocOp->getResult(0));
+      if (!deallocOp)
+        return WalkResult::advance();
+
+      Operation *lastUser = nullptr;
+      const BufferViewFlowAnalysis::ValueSetT &deps =
+          analysis.resolve(allocOp->getResult(0));
+      for (auto dep : llvm::make_early_inc_range(deps)) {
+        for (auto user : dep.getUsers()) {
+          // We are looking for a non dealloc op user.
+          // check if user is the dealloc op itself.
+          if (user == deallocOp)
+            continue;
+
+          // find the ancestor of user that is in the same block as the allocOp.
+          auto topUser = allocOp->getBlock()->findAncestorOpInBlock(*user);
+          if (!lastUser || happensBefore(lastUser, topUser)) {
+            lastUser = topUser;
+          }
+        }
+      }
+      if (lastUser == nullptr) {
+        return WalkResult::advance();
+      }
+      LDBG("Last user found: " << *lastUser);
+      assert(lastUser->getBlock() == allocOp->getBlock());
+      assert(lastUser->getBlock() == deallocOp->getBlock());
+      // Move the dealloc op after the last user.
+      deallocOp->moveAfter(lastUser);
+      LDBG("Moved dealloc op after: " << *lastUser);
+
+      return WalkResult::advance();
+    });
+  }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// OptimizeAllocatinliveness construction
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<Pass>
+mlir::bufferization::createOptimizeAllocationLivenessPass() {
+  return std::make_unique<OptimizeAllocationLiveness>();
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir b/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir
new file mode 100644
index 00000000000000..137426190eab9f
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir
@@ -0,0 +1,178 @@
+// RUN: mlir-opt %s --optimize-allocation-liveness --split-input-file | FileCheck %s
+
+// CHECK-LABEL:   func.func private @optimize_alloc_location(
+// CHECK-SAME:                                               %[[VAL_0:.*]]: memref<45x24x256xf32, 1>,
+// CHECK-SAME:                                               %[[VAL_1:.*]]: memref<24x256xf32, 1>,
+// CHECK-SAME:                                               %[[VAL_2:.*]]: memref<256xf32, 1>) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_5:.*]] = memref.expand_shape %[[VAL_4]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_4]] : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_6:.*]] = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+// CHECK:           %[[VAL_7:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK:           memref.store %[[VAL_7]], %[[VAL_6]]{{\[}}%[[VAL_3]], %[[VAL_3]]] : memref<24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_6]] : memref<24x256xf32, 1>
+// CHECK:           return
+// CHECK:         }
+
+
+// This test will optimize the location of the %alloc deallocation
+func.func private @optimize_alloc_location(%arg0: memref<45x24x256xf32, 1> , %arg1: memref<24x256xf32, 1> , %arg2: memref<256xf32, 1>) -> () {
+  %c1 = arith.constant 1 : index
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape = memref.expand_shape %alloc [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+  %cf1 = arith.constant 1.0 : f32
+  memref.store %cf1, %alloc_1[%c1, %c1] : memref<24x256xf32, 1>
+  memref.dealloc %alloc : memref<45x6144xf32, 1>
+  memref.dealloc %alloc_1 : memref<24x256xf32, 1>
+  return
+}
+
+// -----
+
+// CHECK-LABEL:   func.func private @test_multiple_deallocation_moves(
+// CHECK-SAME:                                                        %[[VAL_0:.*]]: memref<45x24x256xf32, 1>,
+// CHECK-SAME:                                                        %[[VAL_1:.*]]: memref<24x256xf32, 1>,
+// CHECK-SAME:                                                        %[[VAL_2:.*]]: memref<256xf32, 1>) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_5:.*]] = memref.expand_shape %[[VAL_4]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_4]] : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_6:.*]] = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+// CHECK:           %[[VAL_7:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_8:.*]] = memref.expand_shape %[[VAL_7]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_7]] : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_9:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_10:.*]] = memref.expand_shape %[[VAL_9]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_9]] : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_11:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_12:.*]] = memref.expand_shape %[[VAL_11]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_11]] : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK:           memref.store %[[VAL_13]], %[[VAL_6]]{{\[}}%[[VAL_3]], %[[VAL_3]]] : memref<24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_6]] : memref<24x256xf32, 1>
+// CHECK:           return
+// CHECK:         }
+
+
+// This test creates multiple deallocation rearrangements. 
+func.func private @test_multiple_deallocation_moves(%arg0: memref<45x24x256xf32, 1> , %arg1: memref<24x256xf32, 1> , %arg2: memref<256xf32, 1>) -> () {
+  %c1 = arith.constant 1 : index
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape = memref.expand_shape %alloc [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+  %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape2 = memref.expand_shape %alloc_2 [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape3 = memref.expand_shape %alloc_3 [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape4 = memref.expand_shape %alloc_4 [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %cf1 = arith.constant 1.0 : f32
+  memref.store %cf1, %alloc_1[%c1, %c1] : memref<24x256xf32, 1>
+  memref.dealloc %alloc : memref<45x6144xf32, 1>
+  memref.dealloc %alloc_1 : memref<24x256xf32, 1>
+  memref.dealloc %alloc_2 : memref<45x6144xf32, 1>
+  memref.dealloc %alloc_3 : memref<45x6144xf32, 1>
+  memref.dealloc %alloc_4 : memref<45x6144xf32, 1>
+  return
+}
+
+// -----
+// CHECK-LABEL:   func.func private @test_users_in_different_blocks_linalig_generic(
+// CHECK-SAME:                                                                      %[[VAL_0:.*]]: memref<1x20x20xf32, 1>) -> memref<1x32x32xf32, 1> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x32x32xf32, 1>
+// CHECK:           %[[VAL_4:.*]] = memref.subview %[[VAL_3]][0, 0, 0] [1, 20, 20] [1, 1, 1] : memref<1x32x32xf32, 1> to memref<1x20x20xf32, strided<[1024, 32, 1]>, 1>
+// CHECK:           memref.copy %[[VAL_0]], %[[VAL_4]] : memref<1x20x20xf32, 1> to memref<1x20x20xf32, strided<[1024, 32, 1]>, 1>
+// CHECK:           %[[VAL_5:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x1xf32, 1>
+// CHECK:           %[[VAL_6:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x8x32x1x4xf32, 1>
+// CHECK:           linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} outs(%[[VAL_6]] : memref<1x8x32x1x4xf32, 1>) {
+// CHECK:           ^bb0(%[[VAL_7:.*]]: f32):
+// CHECK:             %[[VAL_8:.*]] = linalg.index 0 : index
+// CHECK:             %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]], %[[VAL_2]]] : memref<1x32x32x1xf32, 1>
+// CHECK:             linalg.yield %[[VAL_9]] : f32
+// CHECK:           }
+// CHECK:           memref.dealloc %[[VAL_5]] : memref<1x32x32x1xf32, 1>
+// CHECK:           %[[VAL_10:.*]] = memref.collapse_shape %[[VAL_6]] {{\[\[}}0, 1], [2], [3], [4]] : memref<1x8x32x1x4xf32, 1> into memref<8x32x1x4xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_6]] : memref<1x8x32x1x4xf32, 1>
+// CHECK:           return %[[VAL_3]] : memref<1x32x32xf32, 1>
+// CHECK:         }
+
+
+
+// This test will optimize the location of the %alloc_0 deallocation, since the last user of this allocation is the last linalg.generic operation
+// it will move the deallocation right after the last linalg.generic operation
+// %alloc_1 will not be moved becuase of the collapse shape op.
+func.func private @test_users_in_different_blocks_linalig_generic(%arg0: memref<1x20x20xf32, 1>) -> (memref<1x32x32xf32, 1>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x32x32xf32, 1>
+  %subview = memref.subview %alloc[0, 0, 0] [1, 20, 20] [1, 1, 1] : memref<1x32x32xf32, 1> to memref<1x20x20xf32, strided<[1024, 32, 1]>, 1>
+  memref.copy %arg0, %subview : memref<1x20x20xf32, 1> to memref<1x20x20xf32, strided<[1024, 32, 1]>, 1>
+  %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x1xf32, 1>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x8x32x1x4xf32, 1>
+  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} outs(%alloc_1 : memref<1x8x32x1x4xf32, 1>) {
+  ^bb0(%out: f32):
+    %0 = linalg.index 0 : index
+    %8 = memref.load %alloc_0[%0, %0, %0, %c0] : memref<1x32x32x1xf32, 1>
+    linalg.yield %8 : f32
+  }
+  %collapse_shape = memref.collapse_shape %alloc_1 [[0, 1], [2], [3], [4]] : memref<1x8x32x1x4xf32, 1> into memref<8x32x1x4xf32, 1>
+  memref.dealloc %alloc_0 : memref<1x32x32x1xf32, 1>
+  memref.dealloc %alloc_1 : memref<1x8x32x1x4xf32, 1>
+  return %alloc : memref<1x32x32xf32, 1>
+}
+
+// -----
+// CHECK-LABEL:   func.func private @test_deallocs_in_different_block_forops(
+// CHECK-SAME:                                                               %[[VAL_0:.*]]: memref<45x24x256xf32, 1>,
+// CHECK-SAME:                                                               %[[VAL_1:.*]]: memref<24x256xf32, 1>,
+// CHECK-SAME:                                                               %[[VAL_2:.*]]: memref<256xf32, 1>) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 8 : index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 45 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 24 : index
+// CHECK:           %[[VAL_8:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_9:.*]] = memref.expand_shape %[[VAL_8]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           %[[VAL_10:.*]] = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+// CHECK:           %[[VAL_11:.*]] = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+// CHECK:           %[[VAL_12:.*]] = memref.expand_shape %[[VAL_11]] {{\[\[}}0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_11]] : memref<45x6144xf32, 1>
+// CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] {
+// CHECK:             scf.for %[[VAL_14:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_5]] {
+// CHECK:               %[[VAL_15:.*]] = memref.subview %[[VAL_9]]{{\[}}%[[VAL_13]], %[[VAL_14]], 0] [1, 8, 256] [1, 1, 1] : memref<45x24x256xf32, 1> to memref<1x8x256xf32, strided<[6144, 256, 1], offset: ?>, 1>
+// CHECK:               %[[VAL_16:.*]] = memref.subview %[[VAL_10]]{{\[}}%[[VAL_14]], 0] [8, 256] [1, 1] : memref<24x256xf32, 1> to memref<8x256xf32, strided<[256, 1], offset: ?>, 1>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           memref.dealloc %[[VAL_10]] : memref<24x256xf32, 1>
+// CHECK:           memref.dealloc %[[VAL_8]] : memref<45x6144xf32, 1>
+// CHECK:           return
+// CHECK:         }
+
+// This test will not move the deallocations %alloc and %alloc1 since they are used in the last scf.for operation
+// %alloc_2 will move right after its last user the expand_shape operation
+func.func private @test_deallocs_in_different_block_forops(%arg0: memref<45x24x256xf32, 1>, %arg1: memref<24x256xf32, 1> , %arg2: memref<256xf32, 1> ) -> () {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c45 = arith.constant 45 : index
+  %c24 = arith.constant 24 : index
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape = memref.expand_shape %alloc [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<24x256xf32, 1>
+  %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<45x6144xf32, 1>
+  %expand_shape2 = memref.expand_shape %alloc_2 [[0], [1, 2]] output_shape [45, 24, 256] : memref<45x6144xf32, 1> into memref<45x24x256xf32, 1>
+  scf.for %arg3 = %c0 to %c45 step %c1 {
+    scf.for %arg4 = %c0 to %c24 step %c8 {
+      %subview = memref.subview %expand_shape[%arg3, %arg4, 0] [1, 8, 256] [1, 1, 1] : memref<45x24x256xf32, 1> to memref<1x8x256xf32, strided<[6144, 256, 1], offset: ?>, 1>
+      %subview_3 = memref.subview %alloc_1[%arg4, 0] [8, 256] [1, 1] : memref<24x256xf32, 1> to memref<8x256xf32, strided<[256, 1], offset: ?>, 1>    
+    }
+  }
+  memref.dealloc %alloc : memref<45x6144xf32, 1>
+  memref.dealloc %alloc_1 : memref<24x256xf32, 1>
+  memref.dealloc %alloc_2 : memref<45x6144xf32, 1>
+  return
+}