[Mlir-commits] [mlir] [MLIR][Affine] Fix sibling fusion - missing check (PR #126626)

Wed Feb 12 15:57:14 PST 2025

https://github.com/bondhugula updated https://github.com/llvm/llvm-project/pull/126626

>From 7a03b9fa54d084a4d7cf3142c200357066ca2671 Mon Sep 17 00:00:00 2001
From: Uday Bondhugula <uday at polymagelabs.com>
Date: Tue, 11 Feb 2025 09:01:16 +0530
Subject: [PATCH] [MLIR][Affine] Fix sibling fusion - missing check

Fix sibling fusion for slice maximality check. Producer-consumer fusion
had this check but not sibling fusion. Sibling fusion shouldn't be
performed if the slice isn't "maximal" (i.e., if it isn't the whole of
the source).

Fixes: https://github.com/llvm/llvm-project/issues/48703
---
 .../Dialect/Affine/Transforms/LoopFusion.cpp  | 41 +++++++++++++------
 mlir/test/Dialect/Affine/loop-fusion-2.mlir   |  2 +
 mlir/test/Dialect/Affine/loop-fusion-4.mlir   | 23 ++++++++---
 mlir/test/Dialect/Affine/loop-fusion.mlir     |  3 ++
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index b38dd8effe669..74be68943ee98 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -1162,24 +1162,48 @@ struct GreedyFusion {
       }
 
       assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
-      assert(!depthSliceUnions[bestDstLoopDepth - 1].isEmpty() &&
+
+      const ComputationSliceState &bestSlice =
+          depthSliceUnions[bestDstLoopDepth - 1];
+      assert(!bestSlice.isEmpty() &&
              "Fusion depth has no computed slice union");
+
+      // Do not perform sibling fusion if it isn't maximal. We always remove the
+      // sibling node and as such fusion shouldn't be performed if a part of the
+      // slice is used in the destination.
+      auto isMaximal = bestSlice.isMaximal();
+      if (!isMaximal.value_or(false)) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Slice isn't maximal; not performing sibling fusion.\n");
+        continue;
+      }
+
       // Check if source loop is being inserted in the innermost
       // destination loop. Based on this, the fused loop may be optimized
       // further inside `fuseLoops`.
       bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest);
       // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'.
-      affine::fuseLoops(sibAffineForOp, dstAffineForOp,
-                        depthSliceUnions[bestDstLoopDepth - 1],
+      affine::fuseLoops(sibAffineForOp, dstAffineForOp, bestSlice,
                         isInnermostInsertion);
 
       auto dstForInst = cast<AffineForOp>(dstNode->op);
       // Update operation position of fused loop nest (if needed).
-      if (insertPointInst != dstForInst) {
+      if (insertPointInst != dstForInst)
         dstForInst->moveBefore(insertPointInst);
-      }
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Fused sibling nest " << sibId << " into destination nest "
+                 << dstNode->id << " at depth " << bestDstLoopDepth << ":\n"
+                 << dstAffineForOp << "\n");
+
       // Update data dependence graph state post fusion.
       updateStateAfterSiblingFusion(sibNode, dstNode);
+
+      // Remove old sibling loop nest.
+      // Get op before we invalidate the MDG node.
+      Operation *op = sibNode->op;
+      mdg->removeNode(sibNode->id);
+      op->erase();
     }
   }
 
@@ -1321,13 +1345,6 @@ struct GreedyFusion {
     mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts,
                    dstLoopCollector.storeOpInsts, dstLoopCollector.memrefLoads,
                    dstLoopCollector.memrefStores, dstLoopCollector.memrefFrees);
-    // Remove old sibling loop nest if it no longer has outgoing dependence
-    // edges, and it does not write to a memref which escapes the block.
-    if (mdg->getOutEdgeCount(sibNode->id) == 0) {
-      Operation *op = sibNode->op;
-      mdg->removeNode(sibNode->id);
-      op->erase();
-    }
   }
 
   // Clean up any allocs with no users.
diff --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
index 8fec24f71b14a..99207e4910462 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-2.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
@@ -389,6 +389,8 @@ func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10
 
 // -----
 
+// Test sibling fusion of two matrix-vector products sharing the input matrix.
+
 func.func @two_matrix_vector_products() {
   %in_matrix = memref.alloc() : memref<10x10xf32>
   %in_vec0 = memref.alloc() : memref<10xf32>
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index 2830235431c76..813b93a248e4a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
+// All fusion: producer-consumer and sibling.
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV
 
 // Part I of fusion tests in  mlir/test/Transforms/loop-fusion.mlir.
@@ -108,6 +110,7 @@ func.func @check_src_dst_step(%m : memref<100xf32>,
 func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
     %cst_0 = arith.constant 0.000000e+00 : f32
     %cst_1 = arith.constant 1.000000e+00 : f32
+    // This nest writes to %arg1 but can be eliminated post sibling fusion.
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 64 {
         %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
@@ -137,11 +140,11 @@ func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : m
 // since the destination loop and source loop trip counts do not
 // match.
 // SIBLING-MAXIMAL:        %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT:        %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT:           affine.for %[[idx_0:.*]]= 0 to 1 {
-// SIBLING-MAXIMAL-NEXT:             affine.for %[[idx_1:.*]] = 0 to 64 {
-// SIBLING-MAXIMAL-NEXT:               %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
-// SIBLING-MAXIMAL-NEXT:                 %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT:   %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT:   affine.for %{{.*}} = 0 to 1 {
+// SIBLING-MAXIMAL-NEXT:     affine.for %{{.*}} = 0 to 64 {
+// SIBLING-MAXIMAL-NEXT:       affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT:       affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) {
 
 // -----
 
@@ -315,11 +318,16 @@ func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref<
   return
 }
 
+// -----
+
 // PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores
+// ALL-LABEL: func @same_memref_load_multiple_stores
 func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){
   %cst = arith.constant 2.000000e+00 : f32
-  // Source isn't removed.
+  // Ensure that source isn't removed during both producer-consumer fusion and
+  // sibling fusion.
   // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32
+  // ALL: affine.for %{{.*}} = 0 to 32
   affine.for %arg3 = 0 to 32 {
     %0 = affine.load %producer[%arg3] : memref<32xf32>
     %2 = arith.mulf %0, %cst : f32
@@ -343,5 +351,8 @@ func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %produce
   // PRODUCER-CONSUMER-NEXT:   arith.addf
   // PRODUCER-CONSUMER-NEXT:   affine.store
   // PRODUCER-CONSUMER-NEXT: }
+  // ALL:     affine.for %{{.*}} = 0 to 16
+  // ALL:       mulf
+  // ALL:       addf
   return
 }
diff --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir
index 1c119e87c5336..dcd2e1cdb275a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion.mlir
@@ -1206,6 +1206,9 @@ func.func @should_fuse_with_private_memref() {
   // CHECK:      affine.for %{{.*}} = 0 to 17 {
   // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
+  // CHECK-NEXT: }
+  // CHECK:      affine.for %{{.*}} = 0 to 82 {
+  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT: }
   // CHECK-NEXT: return