[Mlir-commits] [mlir] 8421ad7 - [MLIR][Affine] Fix sibling fusion - missing check (#126626)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Wed Feb 12 16:51:07 PST 2025


Author: Uday Bondhugula
Date: 2025-02-13T06:21:03+05:30
New Revision: 8421ad7f4515653001c3927734fc6204367478a0

URL: https://github.com/llvm/llvm-project/commit/8421ad7f4515653001c3927734fc6204367478a0
DIFF: https://github.com/llvm/llvm-project/commit/8421ad7f4515653001c3927734fc6204367478a0.diff

LOG: [MLIR][Affine] Fix sibling fusion - missing check (#126626)

Fix sibling fusion for slice maximality check. Producer-consumer fusion
had this check but not sibling fusion. Sibling fusion shouldn't be
performed if the slice isn't "maximal" (i.e., if it isn't the whole of
the source).


Fixes: https://github.com/llvm/llvm-project/issues/48703

Added: 
    

Modified: 
    mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
    mlir/test/Dialect/Affine/loop-fusion-2.mlir
    mlir/test/Dialect/Affine/loop-fusion-4.mlir
    mlir/test/Dialect/Affine/loop-fusion.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index 7763831141c6b..30019447d94e8 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -1165,24 +1165,48 @@ struct GreedyFusion {
       }
 
       assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
-      assert(!depthSliceUnions[bestDstLoopDepth - 1].isEmpty() &&
+
+      const ComputationSliceState &bestSlice =
+          depthSliceUnions[bestDstLoopDepth - 1];
+      assert(!bestSlice.isEmpty() &&
              "Fusion depth has no computed slice union");
+
+      // Do not perform sibling fusion if it isn't maximal. We always remove the
+      // sibling node and as such fusion shouldn't be performed if a part of the
+      // slice is used in the destination.
+      auto isMaximal = bestSlice.isMaximal();
+      if (!isMaximal.value_or(false)) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Slice isn't maximal; not performing sibling fusion.\n");
+        continue;
+      }
+
       // Check if source loop is being inserted in the innermost
       // destination loop. Based on this, the fused loop may be optimized
       // further inside `fuseLoops`.
       bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest);
       // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'.
-      affine::fuseLoops(sibAffineForOp, dstAffineForOp,
-                        depthSliceUnions[bestDstLoopDepth - 1],
+      affine::fuseLoops(sibAffineForOp, dstAffineForOp, bestSlice,
                         isInnermostInsertion);
 
       auto dstForInst = cast<AffineForOp>(dstNode->op);
       // Update operation position of fused loop nest (if needed).
-      if (insertPointInst != dstForInst) {
+      if (insertPointInst != dstForInst)
         dstForInst->moveBefore(insertPointInst);
-      }
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Fused sibling nest " << sibId << " into destination nest "
+                 << dstNode->id << " at depth " << bestDstLoopDepth << ":\n"
+                 << dstAffineForOp << "\n");
+
       // Update data dependence graph state post fusion.
       updateStateAfterSiblingFusion(sibNode, dstNode);
+
+      // Remove old sibling loop nest.
+      // Get op before we invalidate the MDG node.
+      Operation *op = sibNode->op;
+      mdg->removeNode(sibNode->id);
+      op->erase();
     }
   }
 
@@ -1324,13 +1348,6 @@ struct GreedyFusion {
     mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts,
                    dstLoopCollector.storeOpInsts, dstLoopCollector.memrefLoads,
                    dstLoopCollector.memrefStores, dstLoopCollector.memrefFrees);
-    // Remove old sibling loop nest if it no longer has outgoing dependence
-    // edges, and it does not write to a memref which escapes the block.
-    if (mdg->getOutEdgeCount(sibNode->id) == 0) {
-      Operation *op = sibNode->op;
-      mdg->removeNode(sibNode->id);
-      op->erase();
-    }
   }
 
   // Clean up any allocs with no users.

diff  --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
index 8fec24f71b14a..99207e4910462 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-2.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
@@ -389,6 +389,8 @@ func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10
 
 // -----
 
+// Test sibling fusion of two matrix-vector products sharing the input matrix.
+
 func.func @two_matrix_vector_products() {
   %in_matrix = memref.alloc() : memref<10x10xf32>
   %in_vec0 = memref.alloc() : memref<10xf32>

diff  --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index 07d2d06f1451d..788d7f9470530 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -1,6 +1,8 @@
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER-MAXIMAL
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
+// All fusion: producer-consumer and sibling.
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL
 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV
 
 // Part I of fusion tests in  mlir/test/Transforms/loop-fusion.mlir.
@@ -109,6 +111,7 @@ func.func @check_src_dst_step(%m : memref<100xf32>,
 func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
     %cst_0 = arith.constant 0.000000e+00 : f32
     %cst_1 = arith.constant 1.000000e+00 : f32
+    // This nest writes to %arg1 but can be eliminated post sibling fusion.
     affine.for %arg3 = 0 to 1 {
       affine.for %arg4 = 0 to 64 {
         %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
@@ -138,11 +141,11 @@ func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : m
 // since the destination loop and source loop trip counts do not
 // match.
 // SIBLING-MAXIMAL:        %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT:        %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT:           affine.for %[[idx_0:.*]]= 0 to 1 {
-// SIBLING-MAXIMAL-NEXT:             affine.for %[[idx_1:.*]] = 0 to 64 {
-// SIBLING-MAXIMAL-NEXT:               %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
-// SIBLING-MAXIMAL-NEXT:                 %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT:   %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT:   affine.for %{{.*}} = 0 to 1 {
+// SIBLING-MAXIMAL-NEXT:     affine.for %{{.*}} = 0 to 64 {
+// SIBLING-MAXIMAL-NEXT:       affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT:       affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) {
 
 // -----
 
@@ -316,11 +319,16 @@ func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref<
   return
 }
 
+// -----
+
 // PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores
+// ALL-LABEL: func @same_memref_load_multiple_stores
 func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){
   %cst = arith.constant 2.000000e+00 : f32
-  // Source isn't removed.
+  // Ensure that source isn't removed during both producer-consumer fusion and
+  // sibling fusion.
   // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32
+  // ALL: affine.for %{{.*}} = 0 to 32
   affine.for %arg3 = 0 to 32 {
     %0 = affine.load %producer[%arg3] : memref<32xf32>
     %2 = arith.mulf %0, %cst : f32
@@ -344,6 +352,9 @@ func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %produce
   // PRODUCER-CONSUMER-NEXT:   arith.addf
   // PRODUCER-CONSUMER-NEXT:   affine.store
   // PRODUCER-CONSUMER-NEXT: }
+  // ALL:     affine.for %{{.*}} = 0 to 16
+  // ALL:       mulf
+  // ALL:       addf
   return
 }
 

diff  --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir
index 1c119e87c5336..dcd2e1cdb275a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion.mlir
@@ -1206,6 +1206,9 @@ func.func @should_fuse_with_private_memref() {
   // CHECK:      affine.for %{{.*}} = 0 to 17 {
   // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
+  // CHECK-NEXT: }
+  // CHECK:      affine.for %{{.*}} = 0 to 82 {
+  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
   // CHECK-NEXT: }
   // CHECK-NEXT: return


        


More information about the Mlir-commits mailing list