[Mlir-commits] [mlir] [mlir] Fix region simplification bug when later blocks use prior block argument values (PR #97960)

Tue Sep 3 19:19:07 PDT 2024

https://github.com/bmhowe23 updated https://github.com/llvm/llvm-project/pull/97960

>From 62d10da62b04d70c482f5c89e5ac540a249e2a96 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 1/6] [mlir][test] Add test for issue #94520

---
 .../test-region-simplification.mlir           | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir

diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
new file mode 100644
index 00000000000000..f425c47addfcb3
--- /dev/null
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -0,0 +1,49 @@
+// Baseline check
+// RUN: mlir-opt %s --convert-func-to-llvm --convert-cf-to-llvm | \
+// RUN: mlir-cpu-runner -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+// Region simplification check
+// RUN: mlir-opt %s \
+// RUN: --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' \
+// RUN: --convert-func-to-llvm --convert-cf-to-llvm | mlir-cpu-runner \
+// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+func.func @nested_loop() -> i32 {
+  %c3_i64 = arith.constant 3 : i64
+  %c2_i64 = arith.constant 2 : i64
+  %c0_i64 = arith.constant 0 : i64
+  %c1_i64 = arith.constant 1 : i64
+  %c1_i32 = arith.constant 1 : i32
+  %c0_i32 = arith.constant 0 : i32
+  cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
+^bb1(%0: i32, %1: i64):  // 2 preds: ^bb0, ^bb8
+  %2 = arith.cmpi ult, %1, %c2_i64 : i64
+  cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
+^bb2(%3: i32, %4: i64):  // pred: ^bb1
+  %5 = arith.addi %4, %c1_i64 : i64
+  cf.br ^bb3(%3, %5 : i32, i64)
+^bb3(%6: i32, %7: i64):  // 2 preds: ^bb2, ^bb5
+  %8 = arith.cmpi ult, %7, %c3_i64 : i64
+  cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
+^bb4(%9: i32, %10: i64):  // pred: ^bb3
+  %11 = arith.addi %9, %c1_i32 : i32
+  cf.br ^bb5(%11, %10 : i32, i64)
+^bb5(%12: i32, %13: i64):  // pred: ^bb4
+  %14 = arith.addi %13, %c1_i64 : i64
+  cf.br ^bb3(%12, %14 : i32, i64)
+^bb6(%15: i32, %16: i64):  // pred: ^bb3
+  cf.br ^bb7
+^bb7:  // pred: ^bb6
+  cf.br ^bb8(%15, %4 : i32, i64)
+^bb8(%17: i32, %18: i64):  // pred: ^bb7
+  %19 = arith.addi %18, %c1_i64 : i64
+  cf.br ^bb1(%17, %19 : i32, i64)
+^bb9(%20: i32, %21: i64):  // pred: ^bb1
+  cf.br ^bb10
+^bb10:  // pred: ^bb9
+  return %20 : i32
+}
+
+// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
+// this will be 3.
+// CHECK: 3

>From c18db8e175a0a5d8e11402cf941ab582654dcd18 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 2/6] [mlir] Fix region simplification bug

---
 mlir/lib/Transforms/Utils/RegionUtils.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 4c0f15bafbaba3..4d70a2817deeac 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -778,6 +778,15 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
       if (hasNonEmptyRegion)
         continue;
 
+      // Don't allow merging if this block's arguments are used outside of the
+      // original block.
+      bool argHasExternalUsers = llvm::any_of(
+          block->getArguments(), [block](mlir::BlockArgument &arg) {
+            return arg.isUsedOutsideOfBlock(block);
+          });
+      if (argHasExternalUsers)
+        continue;
+
       // Try to add this block to an existing cluster.
       bool addedToCluster = false;
       for (auto &cluster : clusters)

>From 0413135054d82415c72a58317d1a979e30c3498b Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Mon, 12 Aug 2024 15:04:50 +0000
Subject: [PATCH 3/6] Update test

---
 .../test-region-simplification.mlir           | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
index f425c47addfcb3..cb22bd6d1f5938 100644
--- a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -9,39 +9,31 @@
 // RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
 
 func.func @nested_loop() -> i32 {
-  %c3_i64 = arith.constant 3 : i64
-  %c2_i64 = arith.constant 2 : i64
-  %c0_i64 = arith.constant 0 : i64
-  %c1_i64 = arith.constant 1 : i64
-  %c1_i32 = arith.constant 1 : i32
   %c0_i32 = arith.constant 0 : i32
-  cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
-^bb1(%0: i32, %1: i64):  // 2 preds: ^bb0, ^bb8
-  %2 = arith.cmpi ult, %1, %c2_i64 : i64
-  cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
-^bb2(%3: i32, %4: i64):  // pred: ^bb1
-  %5 = arith.addi %4, %c1_i64 : i64
-  cf.br ^bb3(%3, %5 : i32, i64)
-^bb3(%6: i32, %7: i64):  // 2 preds: ^bb2, ^bb5
-  %8 = arith.cmpi ult, %7, %c3_i64 : i64
-  cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
-^bb4(%9: i32, %10: i64):  // pred: ^bb3
+  %c1_i32 = arith.constant 1 : i32
+  %c2_i32 = arith.constant 2 : i32
+  %c3_i32 = arith.constant 3 : i32
+  cf.br ^bb1(%c0_i32, %c0_i32 : i32, i32)
+^bb1(%0: i32, %1: i32):
+  %2 = arith.cmpi ult, %1, %c2_i32 : i32
+  cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
+^bb2(%3: i32, %4: i32):
+  %5 = arith.addi %4, %c1_i32 : i32
+  cf.br ^bb3(%3, %5 : i32, i32)
+^bb3(%6: i32, %7: i32):
+  %8 = arith.cmpi ult, %7, %c3_i32 : i32
+  cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
+^bb4(%9: i32, %10: i32):
   %11 = arith.addi %9, %c1_i32 : i32
-  cf.br ^bb5(%11, %10 : i32, i64)
-^bb5(%12: i32, %13: i64):  // pred: ^bb4
-  %14 = arith.addi %13, %c1_i64 : i64
-  cf.br ^bb3(%12, %14 : i32, i64)
-^bb6(%15: i32, %16: i64):  // pred: ^bb3
-  cf.br ^bb7
-^bb7:  // pred: ^bb6
-  cf.br ^bb8(%15, %4 : i32, i64)
-^bb8(%17: i32, %18: i64):  // pred: ^bb7
-  %19 = arith.addi %18, %c1_i64 : i64
-  cf.br ^bb1(%17, %19 : i32, i64)
-^bb9(%20: i32, %21: i64):  // pred: ^bb1
-  cf.br ^bb10
-^bb10:  // pred: ^bb9
-  return %20 : i32
+  cf.br ^bb5(%11, %10 : i32, i32)
+^bb5(%12: i32, %13: i32):
+  %14 = arith.addi %13, %c1_i32 : i32
+  cf.br ^bb3(%12, %14 : i32, i32)
+^bb6(%15: i32, %16: i32):
+  %17 = arith.addi %16, %c1_i32 : i32
+  cf.br ^bb1(%15, %17 : i32, i32)
+^bb7(%18: i32):
+  return %18 : i32
 }
 
 // If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),

>From 5b3ee9ffc8e5e5a8d8113316d328a2d6b8eb01d5 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Tue, 20 Aug 2024 23:36:55 +0000
Subject: [PATCH 4/6] Replace integration test with unit test

---
 .../test-region-simplification.mlir           | 41 -------------------
 .../test-simplify-without-canonicalize.mlir   | 33 +++++++++++++++
 2 files changed, 33 insertions(+), 41 deletions(-)
 delete mode 100644 mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
 create mode 100644 mlir/test/Transforms/test-simplify-without-canonicalize.mlir

diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
deleted file mode 100644
index cb22bd6d1f5938..00000000000000
--- a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
+++ /dev/null
@@ -1,41 +0,0 @@
-// Baseline check
-// RUN: mlir-opt %s --convert-func-to-llvm --convert-cf-to-llvm | \
-// RUN: mlir-cpu-runner -e nested_loop --entry-point-result=i32 | FileCheck %s
-
-// Region simplification check
-// RUN: mlir-opt %s \
-// RUN: --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' \
-// RUN: --convert-func-to-llvm --convert-cf-to-llvm | mlir-cpu-runner \
-// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
-
-func.func @nested_loop() -> i32 {
-  %c0_i32 = arith.constant 0 : i32
-  %c1_i32 = arith.constant 1 : i32
-  %c2_i32 = arith.constant 2 : i32
-  %c3_i32 = arith.constant 3 : i32
-  cf.br ^bb1(%c0_i32, %c0_i32 : i32, i32)
-^bb1(%0: i32, %1: i32):
-  %2 = arith.cmpi ult, %1, %c2_i32 : i32
-  cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
-^bb2(%3: i32, %4: i32):
-  %5 = arith.addi %4, %c1_i32 : i32
-  cf.br ^bb3(%3, %5 : i32, i32)
-^bb3(%6: i32, %7: i32):
-  %8 = arith.cmpi ult, %7, %c3_i32 : i32
-  cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
-^bb4(%9: i32, %10: i32):
-  %11 = arith.addi %9, %c1_i32 : i32
-  cf.br ^bb5(%11, %10 : i32, i32)
-^bb5(%12: i32, %13: i32):
-  %14 = arith.addi %13, %c1_i32 : i32
-  cf.br ^bb3(%12, %14 : i32, i32)
-^bb6(%15: i32, %16: i32):
-  %17 = arith.addi %16, %c1_i32 : i32
-  cf.br ^bb1(%15, %17 : i32, i32)
-^bb7(%18: i32):
-  return %18 : i32
-}
-
-// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
-// this will be 3.
-// CHECK: 3
diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
new file mode 100644
index 00000000000000..3144cd021d3564
--- /dev/null
+++ b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' %s | FileCheck %s
+
+// Perform merge checks without performing canonicalization prior to simplification
+
+// This test should not merge ^bb2 and ^bb5, despite the fact that they are
+// identical because %4 is used outside of ^bb2.
+func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i32 {
+  cf.br ^bb1(%arg0, %arg0 : i32, i32)
+^bb1(%0: i32, %1: i32):
+  %2 = "test.foo"(%1, %arg2) : (i32, i32) -> i1
+  cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
+^bb2(%3: i32, %4: i32):
+  %5 = "test.foo"(%4, %arg1) : (i32, i32) -> i32
+  cf.br ^bb3(%3, %5 : i32, i32)
+^bb3(%6: i32, %7: i32):
+  %8 = "test.foo"(%7, %arg3) : (i32, i32) -> i1
+  cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
+^bb4(%9: i32, %10: i32):
+  %11 = "test.foo"(%9, %arg1) : (i32, i32) -> i32
+  cf.br ^bb5(%11, %10 : i32, i32)
+^bb5(%12: i32, %13: i32):
+  %14 = "test.foo"(%13, %arg1) : (i32, i32) -> i32
+  cf.br ^bb3(%12, %14 : i32, i32)
+^bb6(%15: i32, %16: i32):
+  %17 = arith.addi %16, %arg1 : i32
+  cf.br ^bb1(%15, %17 : i32, i32)
+^bb7(%18: i32):
+  return %18 : i32
+}
+
+// CHECK-LABEL:   func.func @nested_loop
+// CHECK:         ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
+// CHECK:           arith.addi %[[BB1_ARG1]]

>From bf79bb13215014ca234df219b9fe0b912c95e526 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Wed, 21 Aug 2024 13:25:17 +0000
Subject: [PATCH 5/6] Add comma so that %1 does not match items like %10

---
 mlir/test/Transforms/test-simplify-without-canonicalize.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
index 3144cd021d3564..f810eaecd56f28 100644
--- a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
+++ b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
@@ -30,4 +30,4 @@ func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i3
 
 // CHECK-LABEL:   func.func @nested_loop
 // CHECK:         ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
-// CHECK:           arith.addi %[[BB1_ARG1]]
+// CHECK:           arith.addi %[[BB1_ARG1]],

>From 395aeacf9ab973856279af5b0231f86f75c3cc2a Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Wed, 4 Sep 2024 02:18:09 +0000
Subject: [PATCH 6/6] Change test

---
 .../Transforms/canonicalize-block-merge.mlir  | 28 ++++++++++++++++
 .../test-simplify-without-canonicalize.mlir   | 33 -------------------
 2 files changed, 28 insertions(+), 33 deletions(-)
 delete mode 100644 mlir/test/Transforms/test-simplify-without-canonicalize.mlir

diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir
index 92cfde817cf7ff..6dbfcb562e588b 100644
--- a/mlir/test/Transforms/canonicalize-block-merge.mlir
+++ b/mlir/test/Transforms/canonicalize-block-merge.mlir
@@ -290,3 +290,31 @@ func.func @dead_dealloc_fold_multi_use(%cond : i1) {
   memref.dealloc %a: memref<4xf32>
   return
 }
+
+// CHECK-LABEL: func @nested_loop
+func.func @nested_loop(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i1) {
+// Irreducible control-flow: enter the middle of the loop in LoopBody_entry here.
+  "test.foo_br"(%arg0, %arg4)[^LoopBody_entry] : (i32, i32) -> ()
+
+// Loop exit condition: jump to exit or LoobBody blocks
+^Loop_header:  // 2 preds: ^bb2, ^bb3
+  // Consumes the block arg from LoopBody_entry
+  // Because of this use here, we can't merge the two blocks below.
+  "test.foo_br2"(%0)[^EXIT, ^LoopBody_entry, ^LoopBody_other] : (i32) -> ()
+
+// LoopBody_entry is jumped in from the entry block (bb0) and Loop_header
+// It **dominates** the Loop_header.
+^LoopBody_entry(%0: i32):  // 2 preds: ^bb0, ^Loop_header
+ // CHECK: test.bar
+  %1 = "test.bar"(%0) : (i32) -> i32
+  cf.br ^Loop_header
+
+// Other block inside the loop, not dominating the header
+^LoopBody_other(%2: i32):  // pred: ^Loop_header
+ // CHECK: test.bar
+  %3 = "test.bar"(%2) : (i32) -> i32
+  cf.br ^Loop_header
+
+^EXIT:  // pred: ^Loop_header
+  return
+}
diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
deleted file mode 100644
index f810eaecd56f28..00000000000000
--- a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
+++ /dev/null
@@ -1,33 +0,0 @@
-// RUN: mlir-opt --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' %s | FileCheck %s
-
-// Perform merge checks without performing canonicalization prior to simplification
-
-// This test should not merge ^bb2 and ^bb5, despite the fact that they are
-// identical because %4 is used outside of ^bb2.
-func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i32 {
-  cf.br ^bb1(%arg0, %arg0 : i32, i32)
-^bb1(%0: i32, %1: i32):
-  %2 = "test.foo"(%1, %arg2) : (i32, i32) -> i1
-  cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
-^bb2(%3: i32, %4: i32):
-  %5 = "test.foo"(%4, %arg1) : (i32, i32) -> i32
-  cf.br ^bb3(%3, %5 : i32, i32)
-^bb3(%6: i32, %7: i32):
-  %8 = "test.foo"(%7, %arg3) : (i32, i32) -> i1
-  cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
-^bb4(%9: i32, %10: i32):
-  %11 = "test.foo"(%9, %arg1) : (i32, i32) -> i32
-  cf.br ^bb5(%11, %10 : i32, i32)
-^bb5(%12: i32, %13: i32):
-  %14 = "test.foo"(%13, %arg1) : (i32, i32) -> i32
-  cf.br ^bb3(%12, %14 : i32, i32)
-^bb6(%15: i32, %16: i32):
-  %17 = arith.addi %16, %arg1 : i32
-  cf.br ^bb1(%15, %17 : i32, i32)
-^bb7(%18: i32):
-  return %18 : i32
-}
-
-// CHECK-LABEL:   func.func @nested_loop
-// CHECK:         ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
-// CHECK:           arith.addi %[[BB1_ARG1]],