[Mlir-commits] [mlir] [MLIR] Improve in-place folding to iterate until fixed-point (PR #160615)

Mehdi Amini llvmlistbot at llvm.org
Wed Sep 24 15:49:28 PDT 2025


https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/160615

>From bfd7aff781c49f595f87f40784692b09e32d65e5 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Wed, 24 Sep 2025 15:21:38 -0700
Subject: [PATCH] [MLIR] Improve in-place folding to iterate until fixed-point

When executed in the context of canonicalization, folders are invoked
in a fixed-point iterative process. However, in the context of an API
like `createOrFold()`, or in DialectConversion for example, a single
"one-shot" call to fold is expected to produce IR that is as folded as
possible. Even when each folder is itself idempotent, the folders
attached to a given operation interact with each other. For example:

// X = 0 + Y
%X = arith.addi %c_0, %Y : i32

should fold to %Y, but doing so first requires the folder provided by
the IsCommutative trait to move the constant to the right-hand side.
That trait folder only runs after the op-specific folder has already
been attempted, and the op-specific folder was not attempted again
afterwards, so the addition was left unfolded.

This commit makes sure folder applications are iterated until a fixed
point is reached.
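
The core of the change is a loop of the following shape (a simplified
sketch, not the exact patch code; see the Builders.cpp and
FoldUtils.cpp hunks below for the actual implementation):

  SmallVector<OpFoldResult> foldResults;
  if (failed(op->fold(foldResults)))
    return failure(); // Nothing folded at all.
  // As long as the op was only updated in place (no fold results were
  // produced) and fold() keeps succeeding, fold again: each in-place
  // fold (e.g. the IsCommutative trait moving a constant to the right)
  // may enable the next one.
  while (foldResults.empty() && succeeded(op->fold(foldResults))) {
    // Keep re-folding until a fixed point is reached.
  }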
---
 mlir/lib/IR/Builders.cpp                      |  7 +++++++
 mlir/lib/Transforms/Utils/FoldUtils.cpp       | 12 +++++++++--
 mlir/test/Dialect/Arith/constant-fold.mlir    | 12 +++++++++++
 .../Dialect/XeGPU/xegpu-attr-interface.mlir   |  8 ++------
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  6 ++----
 .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir       | 20 +++++++------------
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 16 +++++----------
 7 files changed, 45 insertions(+), 36 deletions(-)
 create mode 100644 mlir/test/Dialect/Arith/constant-fold.mlir

diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 3d366276b4375..6373b1ead94d1 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -14,6 +14,7 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Matchers.h"
 #include "llvm/ADT/SmallVectorExtras.h"
+#include "llvm/Support/DebugLog.h"
 
 using namespace mlir;
 
@@ -489,6 +490,12 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
   if (failed(op->fold(foldResults)))
     return cleanupFailure();
 
+  int count = 0;
+  do {
+    LDBG() << "Folded in place #" << count++
+           << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  } while (foldResults.empty() && succeeded(op->fold(foldResults)));
+
   // An in-place fold does not require generation of any constants.
   if (foldResults.empty())
     return success();
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index 5e07509871ea2..70310685315c4 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/Operation.h"
+#include "llvm/Support/DebugLog.h"
 
 using namespace mlir;
 
@@ -226,8 +227,15 @@ bool OperationFolder::isFolderOwnedConstant(Operation *op) const {
 LogicalResult OperationFolder::tryToFold(Operation *op,
                                          SmallVectorImpl<Value> &results) {
   SmallVector<OpFoldResult, 8> foldResults;
-  if (failed(op->fold(foldResults)) ||
-      failed(processFoldResults(op, results, foldResults)))
+  if (failed(op->fold(foldResults)))
+    return failure();
+  int count = 1;
+  do {
+    LDBG() << "Folded in place #" << count++
+           << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  } while (foldResults.empty() && succeeded(op->fold(foldResults)));
+
+  if (failed(processFoldResults(op, results, foldResults)))
     return failure();
   return success();
 }
diff --git a/mlir/test/Dialect/Arith/constant-fold.mlir b/mlir/test/Dialect/Arith/constant-fold.mlir
new file mode 100644
index 0000000000000..523c654e6af9a
--- /dev/null
+++ b/mlir/test/Dialect/Arith/constant-fold.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-single-fold))" | FileCheck %s
+
+
+// CHECK: func @recurse_fold_traits(%[[ARG0:.*]]: i32)
+// CHECK:   return %[[ARG0]] : i32
+// Folding this away entirely requires moving the constant to the right
+// before invoking the op-specific folder.
+func.func @recurse_fold_traits(%arg0 : i32) -> i32 {
+  %cst0 = arith.constant 0 : i32
+  %res = arith.addi %cst0, %arg0 : i32
+  return %res : i32
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
index 547c7355e00c6..b73bc69393dab 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -7,10 +7,8 @@ gpu.module @test {
     //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
     //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-    //CHECK: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -23,10 +21,8 @@ gpu.module @test {
     //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
     //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-    //CHECK: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index e5cc65e6bd3d7..d2d250cbe0f66 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -27,12 +27,10 @@ gpu.module @test_round_robin_assignment {
     //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
     //CHECK: [[C0:%.+]] = arith.constant 0 : index
     //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
-    //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
-    //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
     //CHECK: [[C128:%.+]] = arith.constant 128 : index
-    //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+    //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]]
     //CHECK: [[C64_2:%.+]] = arith.constant 64 : index
-    //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
+    //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]]
     //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
       -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 3478a9b91da5f..31d015d89a2dc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -325,12 +325,10 @@ gpu.module @test_distribution {
     //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
     //CHECK: [[c0:%.+]] = arith.constant 0 : index
     //CHECK: [[c0_1:%.+]] = arith.constant 0 : index
-    //CHECK: [[l_off_y_0:%.+]] = arith.addi [[l_off_y]], [[c0]] : index
-    //CHECK: [[l_off_x_0:%.+]] = arith.addi [[l_off_x]], [[c0_1]] : index
     //CHECK: [[c64:%.+]] = arith.constant 64 : index
-    //CHECK: [[off_y:%.+]] = index.remu [[l_off_y_0]], [[c64]]
+    //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK: [[off_x:%.+]] = index.remu [[l_off_x_0]], [[c128]]
+    //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
     //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
     %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
     %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
@@ -349,13 +347,11 @@ gpu.module @test_distribution {
     //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]]
     //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
-    //CHECK: [[l_off_y_0:%.+]] = index.mul [[id_y]], [[c32]]
+    //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]]
     //CHECK: [[c32_1:%.+]] = arith.constant 32 : index
-    //CHECK: [[l_off_x_0:%.+]] = index.mul [[id_x]], [[c32_1]]
+    //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
     //CHECK: [[c0:%.+]] = arith.constant 0 : index
     //CHECK: [[c0_2:%.+]] = arith.constant 0 : index
-    //CHECK: [[l_off_y:%.+]] = arith.addi [[l_off_y_0]], [[c0]] : index
-    //CHECK: [[l_off_x:%.+]] = arith.addi [[l_off_x_0]], [[c0_2]] : index
     //CHECK: [[c64:%.+]] = arith.constant 64 : index
     //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
@@ -372,11 +368,10 @@ gpu.module @test_distribution {
     //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
     //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]]
     //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index
-    //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+    //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]]
     //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]]
     //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -390,9 +385,8 @@ gpu.module @test_distribution {
     //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index
     //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]]
     //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
     //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex>
     //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index c0fb373835e3d..e83229e3a3995 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -14,12 +14,10 @@ gpu.module @test_1_1_assignment {
     //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
     //CHECK: [[C0:%.+]] = arith.constant 0 : index
     //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
-    //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
-    //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
     //CHECK: [[C256:%.+]] = arith.constant 256 : index
-    //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+    //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]]
     //CHECK: [[C128:%.+]] = arith.constant 128 : index
-    //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+    //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]]
     //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
       -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -37,17 +35,13 @@ gpu.module @test_1_1_assignment {
     //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
     //CHECK: [[C0:%.+]] = arith.constant 0 : index
     //CHECK: [[C0_2:%.+]] = arith.constant 0 : index
-    //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
-    //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
     //CHECK: [[C256:%.+]] = arith.constant 256 : index
-    //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
+    //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]]
     //CHECK: [[C128:%.+]] = arith.constant 128 : index
-    //CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
+    //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]]
     //CHECK: [[C0_3:%.+]] = arith.constant 0 : index
-    //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
     //CHECK: [[C0_4:%.+]] = arith.constant 0 : index
-    //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
-    //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
       -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
     gpu.return

