[Mlir-commits] [mlir] [mlir] Fix region simplification bug when later blocks use prior block argument values (PR #97960)
Ben Howe
llvmlistbot at llvm.org
Tue Sep 3 19:19:07 PDT 2024
https://github.com/bmhowe23 updated https://github.com/llvm/llvm-project/pull/97960
>From 62d10da62b04d70c482f5c89e5ac540a249e2a96 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 1/6] [mlir][test] Add test for issue #94520
---
.../test-region-simplification.mlir | 49 +++++++++++++++++++
1 file changed, 49 insertions(+)
create mode 100644 mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
new file mode 100644
index 00000000000000..f425c47addfcb3
--- /dev/null
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -0,0 +1,49 @@
+// Baseline check
+// RUN: mlir-opt %s --convert-func-to-llvm --convert-cf-to-llvm | \
+// RUN: mlir-cpu-runner -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+// Region simplification check
+// RUN: mlir-opt %s \
+// RUN: --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' \
+// RUN: --convert-func-to-llvm --convert-cf-to-llvm | mlir-cpu-runner \
+// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+func.func @nested_loop() -> i32 {
+ %c3_i64 = arith.constant 3 : i64
+ %c2_i64 = arith.constant 2 : i64
+ %c0_i64 = arith.constant 0 : i64
+ %c1_i64 = arith.constant 1 : i64
+ %c1_i32 = arith.constant 1 : i32
+ %c0_i32 = arith.constant 0 : i32
+ cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
+^bb1(%0: i32, %1: i64): // 2 preds: ^bb0, ^bb8
+ %2 = arith.cmpi ult, %1, %c2_i64 : i64
+ cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
+^bb2(%3: i32, %4: i64): // pred: ^bb1
+ %5 = arith.addi %4, %c1_i64 : i64
+ cf.br ^bb3(%3, %5 : i32, i64)
+^bb3(%6: i32, %7: i64): // 2 preds: ^bb2, ^bb5
+ %8 = arith.cmpi ult, %7, %c3_i64 : i64
+ cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
+^bb4(%9: i32, %10: i64): // pred: ^bb3
+ %11 = arith.addi %9, %c1_i32 : i32
+ cf.br ^bb5(%11, %10 : i32, i64)
+^bb5(%12: i32, %13: i64): // pred: ^bb4
+ %14 = arith.addi %13, %c1_i64 : i64
+ cf.br ^bb3(%12, %14 : i32, i64)
+^bb6(%15: i32, %16: i64): // pred: ^bb3
+ cf.br ^bb7
+^bb7: // pred: ^bb6
+ cf.br ^bb8(%15, %4 : i32, i64)
+^bb8(%17: i32, %18: i64): // pred: ^bb7
+ %19 = arith.addi %18, %c1_i64 : i64
+ cf.br ^bb1(%17, %19 : i32, i64)
+^bb9(%20: i32, %21: i64): // pred: ^bb1
+ cf.br ^bb10
+^bb10: // pred: ^bb9
+ return %20 : i32
+}
+
+// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
+// this will be 3.
+// CHECK: 3
>From c18db8e175a0a5d8e11402cf941ab582654dcd18 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 2/6] [mlir] Fix region simplification bug
---
mlir/lib/Transforms/Utils/RegionUtils.cpp | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 4c0f15bafbaba3..4d70a2817deeac 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -778,6 +778,15 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
if (hasNonEmptyRegion)
continue;
+ // Don't allow merging if this block's arguments are used outside of the
+ // original block.
+ bool argHasExternalUsers = llvm::any_of(
+ block->getArguments(), [block](mlir::BlockArgument &arg) {
+ return arg.isUsedOutsideOfBlock(block);
+ });
+ if (argHasExternalUsers)
+ continue;
+
// Try to add this block to an existing cluster.
bool addedToCluster = false;
for (auto &cluster : clusters)
>From 0413135054d82415c72a58317d1a979e30c3498b Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Mon, 12 Aug 2024 15:04:50 +0000
Subject: [PATCH 3/6] Update test
---
.../test-region-simplification.mlir | 54 ++++++++-----------
1 file changed, 23 insertions(+), 31 deletions(-)
diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
index f425c47addfcb3..cb22bd6d1f5938 100644
--- a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -9,39 +9,31 @@
// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
func.func @nested_loop() -> i32 {
- %c3_i64 = arith.constant 3 : i64
- %c2_i64 = arith.constant 2 : i64
- %c0_i64 = arith.constant 0 : i64
- %c1_i64 = arith.constant 1 : i64
- %c1_i32 = arith.constant 1 : i32
%c0_i32 = arith.constant 0 : i32
- cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
-^bb1(%0: i32, %1: i64): // 2 preds: ^bb0, ^bb8
- %2 = arith.cmpi ult, %1, %c2_i64 : i64
- cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
-^bb2(%3: i32, %4: i64): // pred: ^bb1
- %5 = arith.addi %4, %c1_i64 : i64
- cf.br ^bb3(%3, %5 : i32, i64)
-^bb3(%6: i32, %7: i64): // 2 preds: ^bb2, ^bb5
- %8 = arith.cmpi ult, %7, %c3_i64 : i64
- cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
-^bb4(%9: i32, %10: i64): // pred: ^bb3
+ %c1_i32 = arith.constant 1 : i32
+ %c2_i32 = arith.constant 2 : i32
+ %c3_i32 = arith.constant 3 : i32
+ cf.br ^bb1(%c0_i32, %c0_i32 : i32, i32)
+^bb1(%0: i32, %1: i32):
+ %2 = arith.cmpi ult, %1, %c2_i32 : i32
+ cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
+^bb2(%3: i32, %4: i32):
+ %5 = arith.addi %4, %c1_i32 : i32
+ cf.br ^bb3(%3, %5 : i32, i32)
+^bb3(%6: i32, %7: i32):
+ %8 = arith.cmpi ult, %7, %c3_i32 : i32
+ cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
+^bb4(%9: i32, %10: i32):
%11 = arith.addi %9, %c1_i32 : i32
- cf.br ^bb5(%11, %10 : i32, i64)
-^bb5(%12: i32, %13: i64): // pred: ^bb4
- %14 = arith.addi %13, %c1_i64 : i64
- cf.br ^bb3(%12, %14 : i32, i64)
-^bb6(%15: i32, %16: i64): // pred: ^bb3
- cf.br ^bb7
-^bb7: // pred: ^bb6
- cf.br ^bb8(%15, %4 : i32, i64)
-^bb8(%17: i32, %18: i64): // pred: ^bb7
- %19 = arith.addi %18, %c1_i64 : i64
- cf.br ^bb1(%17, %19 : i32, i64)
-^bb9(%20: i32, %21: i64): // pred: ^bb1
- cf.br ^bb10
-^bb10: // pred: ^bb9
- return %20 : i32
+ cf.br ^bb5(%11, %10 : i32, i32)
+^bb5(%12: i32, %13: i32):
+ %14 = arith.addi %13, %c1_i32 : i32
+ cf.br ^bb3(%12, %14 : i32, i32)
+^bb6(%15: i32, %16: i32):
+ %17 = arith.addi %16, %c1_i32 : i32
+ cf.br ^bb1(%15, %17 : i32, i32)
+^bb7(%18: i32):
+ return %18 : i32
}
// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
>From 5b3ee9ffc8e5e5a8d8113316d328a2d6b8eb01d5 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Tue, 20 Aug 2024 23:36:55 +0000
Subject: [PATCH 4/6] Replace integration test with unit test
---
.../test-region-simplification.mlir | 41 -------------------
.../test-simplify-without-canonicalize.mlir | 33 +++++++++++++++
2 files changed, 33 insertions(+), 41 deletions(-)
delete mode 100644 mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
create mode 100644 mlir/test/Transforms/test-simplify-without-canonicalize.mlir
diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
deleted file mode 100644
index cb22bd6d1f5938..00000000000000
--- a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
+++ /dev/null
@@ -1,41 +0,0 @@
-// Baseline check
-// RUN: mlir-opt %s --convert-func-to-llvm --convert-cf-to-llvm | \
-// RUN: mlir-cpu-runner -e nested_loop --entry-point-result=i32 | FileCheck %s
-
-// Region simplification check
-// RUN: mlir-opt %s \
-// RUN: --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' \
-// RUN: --convert-func-to-llvm --convert-cf-to-llvm | mlir-cpu-runner \
-// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
-
-func.func @nested_loop() -> i32 {
- %c0_i32 = arith.constant 0 : i32
- %c1_i32 = arith.constant 1 : i32
- %c2_i32 = arith.constant 2 : i32
- %c3_i32 = arith.constant 3 : i32
- cf.br ^bb1(%c0_i32, %c0_i32 : i32, i32)
-^bb1(%0: i32, %1: i32):
- %2 = arith.cmpi ult, %1, %c2_i32 : i32
- cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
-^bb2(%3: i32, %4: i32):
- %5 = arith.addi %4, %c1_i32 : i32
- cf.br ^bb3(%3, %5 : i32, i32)
-^bb3(%6: i32, %7: i32):
- %8 = arith.cmpi ult, %7, %c3_i32 : i32
- cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
-^bb4(%9: i32, %10: i32):
- %11 = arith.addi %9, %c1_i32 : i32
- cf.br ^bb5(%11, %10 : i32, i32)
-^bb5(%12: i32, %13: i32):
- %14 = arith.addi %13, %c1_i32 : i32
- cf.br ^bb3(%12, %14 : i32, i32)
-^bb6(%15: i32, %16: i32):
- %17 = arith.addi %16, %c1_i32 : i32
- cf.br ^bb1(%15, %17 : i32, i32)
-^bb7(%18: i32):
- return %18 : i32
-}
-
-// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
-// this will be 3.
-// CHECK: 3
diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
new file mode 100644
index 00000000000000..3144cd021d3564
--- /dev/null
+++ b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' %s | FileCheck %s
+
+// Perform merge checks without performing canonicalization prior to simplification
+
+// This test should not merge ^bb2 and ^bb5, despite the fact that they are
+// identical because %4 is used outside of ^bb2.
+func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i32 {
+ cf.br ^bb1(%arg0, %arg0 : i32, i32)
+^bb1(%0: i32, %1: i32):
+ %2 = "test.foo"(%1, %arg2) : (i32, i32) -> i1
+ cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
+^bb2(%3: i32, %4: i32):
+ %5 = "test.foo"(%4, %arg1) : (i32, i32) -> i32
+ cf.br ^bb3(%3, %5 : i32, i32)
+^bb3(%6: i32, %7: i32):
+ %8 = "test.foo"(%7, %arg3) : (i32, i32) -> i1
+ cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
+^bb4(%9: i32, %10: i32):
+ %11 = "test.foo"(%9, %arg1) : (i32, i32) -> i32
+ cf.br ^bb5(%11, %10 : i32, i32)
+^bb5(%12: i32, %13: i32):
+ %14 = "test.foo"(%13, %arg1) : (i32, i32) -> i32
+ cf.br ^bb3(%12, %14 : i32, i32)
+^bb6(%15: i32, %16: i32):
+ %17 = arith.addi %16, %arg1 : i32
+ cf.br ^bb1(%15, %17 : i32, i32)
+^bb7(%18: i32):
+ return %18 : i32
+}
+
+// CHECK-LABEL: func.func @nested_loop
+// CHECK: ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
+// CHECK: arith.addi %[[BB1_ARG1]]
>From bf79bb13215014ca234df219b9fe0b912c95e526 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Wed, 21 Aug 2024 13:25:17 +0000
Subject: [PATCH 5/6] Add comma so that %1 does not match items like %10
---
mlir/test/Transforms/test-simplify-without-canonicalize.mlir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
index 3144cd021d3564..f810eaecd56f28 100644
--- a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
+++ b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
@@ -30,4 +30,4 @@ func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i3
// CHECK-LABEL: func.func @nested_loop
// CHECK: ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
-// CHECK: arith.addi %[[BB1_ARG1]]
+// CHECK: arith.addi %[[BB1_ARG1]],
>From 395aeacf9ab973856279af5b0231f86f75c3cc2a Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Wed, 4 Sep 2024 02:18:09 +0000
Subject: [PATCH 6/6] Change test
---
.../Transforms/canonicalize-block-merge.mlir | 28 ++++++++++++++++
.../test-simplify-without-canonicalize.mlir | 33 -------------------
2 files changed, 28 insertions(+), 33 deletions(-)
delete mode 100644 mlir/test/Transforms/test-simplify-without-canonicalize.mlir
diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir
index 92cfde817cf7ff..6dbfcb562e588b 100644
--- a/mlir/test/Transforms/canonicalize-block-merge.mlir
+++ b/mlir/test/Transforms/canonicalize-block-merge.mlir
@@ -290,3 +290,31 @@ func.func @dead_dealloc_fold_multi_use(%cond : i1) {
memref.dealloc %a: memref<4xf32>
return
}
+
+// CHECK-LABEL: func @nested_loop
+func.func @nested_loop(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i1) {
+// Irreducible control-flow: enter the middle of the loop in LoopBody_entry here.
+ "test.foo_br"(%arg0, %arg4)[^LoopBody_entry] : (i32, i32) -> ()
+
+// Loop exit condition: jump to exit or LoobBody blocks
+^Loop_header: // 2 preds: ^bb2, ^bb3
+ // Consumes the block arg from LoopBody_entry
+ // Because of this use here, we can't merge the two blocks below.
+ "test.foo_br2"(%0)[^EXIT, ^LoopBody_entry, ^LoopBody_other] : (i32) -> ()
+
+// LoopBody_entry is jumped in from the entry block (bb0) and Loop_header
+// It **dominates** the Loop_header.
+^LoopBody_entry(%0: i32): // 2 preds: ^bb0, ^Loop_header
+ // CHECK: test.bar
+ %1 = "test.bar"(%0) : (i32) -> i32
+ cf.br ^Loop_header
+
+// Other block inside the loop, not dominating the header
+^LoopBody_other(%2: i32): // pred: ^Loop_header
+ // CHECK: test.bar
+ %3 = "test.bar"(%2) : (i32) -> i32
+ cf.br ^Loop_header
+
+^EXIT: // pred: ^Loop_header
+ return
+}
diff --git a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir b/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
deleted file mode 100644
index f810eaecd56f28..00000000000000
--- a/mlir/test/Transforms/test-simplify-without-canonicalize.mlir
+++ /dev/null
@@ -1,33 +0,0 @@
-// RUN: mlir-opt --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' %s | FileCheck %s
-
-// Perform merge checks without performing canonicalization prior to simplification
-
-// This test should not merge ^bb2 and ^bb5, despite the fact that they are
-// identical because %4 is used outside of ^bb2.
-func.func @nested_loop(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) -> i32 {
- cf.br ^bb1(%arg0, %arg0 : i32, i32)
-^bb1(%0: i32, %1: i32):
- %2 = "test.foo"(%1, %arg2) : (i32, i32) -> i1
- cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
-^bb2(%3: i32, %4: i32):
- %5 = "test.foo"(%4, %arg1) : (i32, i32) -> i32
- cf.br ^bb3(%3, %5 : i32, i32)
-^bb3(%6: i32, %7: i32):
- %8 = "test.foo"(%7, %arg3) : (i32, i32) -> i1
- cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
-^bb4(%9: i32, %10: i32):
- %11 = "test.foo"(%9, %arg1) : (i32, i32) -> i32
- cf.br ^bb5(%11, %10 : i32, i32)
-^bb5(%12: i32, %13: i32):
- %14 = "test.foo"(%13, %arg1) : (i32, i32) -> i32
- cf.br ^bb3(%12, %14 : i32, i32)
-^bb6(%15: i32, %16: i32):
- %17 = arith.addi %16, %arg1 : i32
- cf.br ^bb1(%15, %17 : i32, i32)
-^bb7(%18: i32):
- return %18 : i32
-}
-
-// CHECK-LABEL: func.func @nested_loop
-// CHECK: ^bb1(%{{.*}}: i32, %[[BB1_ARG1:.*]]: i32):
-// CHECK: arith.addi %[[BB1_ARG1]],
More information about the Mlir-commits
mailing list