[Mlir-commits] [mlir] [mlir] Fix region simplification bug when later blocks use prior block argument values (PR #97960)

Mon Aug 12 08:19:22 PDT 2024

https://github.com/bmhowe23 updated https://github.com/llvm/llvm-project/pull/97960

>From 62d10da62b04d70c482f5c89e5ac540a249e2a96 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 1/3] [mlir][test] Add test for issue #94520

---
 .../test-region-simplification.mlir           | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir

diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
new file mode 100644
index 00000000000000..f425c47addfcb3
--- /dev/null
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -0,0 +1,49 @@
+// Baseline check
+// RUN: mlir-opt %s --convert-func-to-llvm --convert-cf-to-llvm | \
+// RUN: mlir-cpu-runner -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+// Region simplification check
+// RUN: mlir-opt %s \
+// RUN: --canonicalize='enable-patterns=AnyPattern region-simplify=aggressive' \
+// RUN: --convert-func-to-llvm --convert-cf-to-llvm | mlir-cpu-runner \
+// RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
+
+func.func @nested_loop() -> i32 {
+  %c3_i64 = arith.constant 3 : i64
+  %c2_i64 = arith.constant 2 : i64
+  %c0_i64 = arith.constant 0 : i64
+  %c1_i64 = arith.constant 1 : i64
+  %c1_i32 = arith.constant 1 : i32
+  %c0_i32 = arith.constant 0 : i32
+  cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
+^bb1(%0: i32, %1: i64):  // 2 preds: ^bb0, ^bb8
+  %2 = arith.cmpi ult, %1, %c2_i64 : i64
+  cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
+^bb2(%3: i32, %4: i64):  // pred: ^bb1
+  %5 = arith.addi %4, %c1_i64 : i64
+  cf.br ^bb3(%3, %5 : i32, i64)
+^bb3(%6: i32, %7: i64):  // 2 preds: ^bb2, ^bb5
+  %8 = arith.cmpi ult, %7, %c3_i64 : i64
+  cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
+^bb4(%9: i32, %10: i64):  // pred: ^bb3
+  %11 = arith.addi %9, %c1_i32 : i32
+  cf.br ^bb5(%11, %10 : i32, i64)
+^bb5(%12: i32, %13: i64):  // pred: ^bb4
+  %14 = arith.addi %13, %c1_i64 : i64
+  cf.br ^bb3(%12, %14 : i32, i64)
+^bb6(%15: i32, %16: i64):  // pred: ^bb3
+  cf.br ^bb7
+^bb7:  // pred: ^bb6
+  cf.br ^bb8(%15, %4 : i32, i64)
+^bb8(%17: i32, %18: i64):  // pred: ^bb7
+  %19 = arith.addi %18, %c1_i64 : i64
+  cf.br ^bb1(%17, %19 : i32, i64)
+^bb9(%20: i32, %21: i64):  // pred: ^bb1
+  cf.br ^bb10
+^bb10:  // pred: ^bb9
+  return %20 : i32
+}
+
+// If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),
+// this will be 3.
+// CHECK: 3

>From c18db8e175a0a5d8e11402cf941ab582654dcd18 Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Sun, 7 Jul 2024 17:01:15 +0000
Subject: [PATCH 2/3] [mlir] Fix region simplification bug

---
 mlir/lib/Transforms/Utils/RegionUtils.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 4c0f15bafbaba3..4d70a2817deeac 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -778,6 +778,15 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
       if (hasNonEmptyRegion)
         continue;
 
+      // Don't allow merging if any of this block's arguments are used outside
+      // of this block.
+      bool argHasExternalUsers = llvm::any_of(
+          block->getArguments(), [block](mlir::BlockArgument &arg) {
+            return arg.isUsedOutsideOfBlock(block);
+          });
+      if (argHasExternalUsers)
+        continue;
+
       // Try to add this block to an existing cluster.
       bool addedToCluster = false;
       for (auto &cluster : clusters)

>From 0413135054d82415c72a58317d1a979e30c3498b Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe at nvidia.com>
Date: Mon, 12 Aug 2024 15:04:50 +0000
Subject: [PATCH 3/3] Update test

---
 .../test-region-simplification.mlir           | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
index f425c47addfcb3..cb22bd6d1f5938 100644
--- a/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
+++ b/mlir/test/Integration/Dialect/ControlFlow/test-region-simplification.mlir
@@ -9,39 +9,31 @@
 // RUN: -e nested_loop --entry-point-result=i32 | FileCheck %s
 
 func.func @nested_loop() -> i32 {
-  %c3_i64 = arith.constant 3 : i64
-  %c2_i64 = arith.constant 2 : i64
-  %c0_i64 = arith.constant 0 : i64
-  %c1_i64 = arith.constant 1 : i64
-  %c1_i32 = arith.constant 1 : i32
   %c0_i32 = arith.constant 0 : i32
-  cf.br ^bb1(%c0_i32, %c0_i64 : i32, i64)
-^bb1(%0: i32, %1: i64):  // 2 preds: ^bb0, ^bb8
-  %2 = arith.cmpi ult, %1, %c2_i64 : i64
-  cf.cond_br %2, ^bb2(%0, %1 : i32, i64), ^bb9(%0, %1 : i32, i64)
-^bb2(%3: i32, %4: i64):  // pred: ^bb1
-  %5 = arith.addi %4, %c1_i64 : i64
-  cf.br ^bb3(%3, %5 : i32, i64)
-^bb3(%6: i32, %7: i64):  // 2 preds: ^bb2, ^bb5
-  %8 = arith.cmpi ult, %7, %c3_i64 : i64
-  cf.cond_br %8, ^bb4(%6, %7 : i32, i64), ^bb6(%6, %7 : i32, i64)
-^bb4(%9: i32, %10: i64):  // pred: ^bb3
+  %c1_i32 = arith.constant 1 : i32
+  %c2_i32 = arith.constant 2 : i32
+  %c3_i32 = arith.constant 3 : i32
+  cf.br ^bb1(%c0_i32, %c0_i32 : i32, i32)
+^bb1(%0: i32, %1: i32):
+  %2 = arith.cmpi ult, %1, %c2_i32 : i32
+  cf.cond_br %2, ^bb2(%0, %1 : i32, i32), ^bb7(%0 : i32)
+^bb2(%3: i32, %4: i32):
+  %5 = arith.addi %4, %c1_i32 : i32
+  cf.br ^bb3(%3, %5 : i32, i32)
+^bb3(%6: i32, %7: i32):
+  %8 = arith.cmpi ult, %7, %c3_i32 : i32
+  cf.cond_br %8, ^bb4(%6, %7 : i32, i32), ^bb6(%6, %4 : i32, i32)
+^bb4(%9: i32, %10: i32):
   %11 = arith.addi %9, %c1_i32 : i32
-  cf.br ^bb5(%11, %10 : i32, i64)
-^bb5(%12: i32, %13: i64):  // pred: ^bb4
-  %14 = arith.addi %13, %c1_i64 : i64
-  cf.br ^bb3(%12, %14 : i32, i64)
-^bb6(%15: i32, %16: i64):  // pred: ^bb3
-  cf.br ^bb7
-^bb7:  // pred: ^bb6
-  cf.br ^bb8(%15, %4 : i32, i64)
-^bb8(%17: i32, %18: i64):  // pred: ^bb7
-  %19 = arith.addi %18, %c1_i64 : i64
-  cf.br ^bb1(%17, %19 : i32, i64)
-^bb9(%20: i32, %21: i64):  // pred: ^bb1
-  cf.br ^bb10
-^bb10:  // pred: ^bb9
-  return %20 : i32
+  cf.br ^bb5(%11, %10 : i32, i32)
+^bb5(%12: i32, %13: i32):
+  %14 = arith.addi %13, %c1_i32 : i32
+  cf.br ^bb3(%12, %14 : i32, i32)
+^bb6(%15: i32, %16: i32):
+  %17 = arith.addi %16, %c1_i32 : i32
+  cf.br ^bb1(%15, %17 : i32, i32)
+^bb7(%18: i32):
+  return %18 : i32
 }
 
 // If region simplification behaves correctly (by NOT merging ^bb2 and ^bb5),