[Mlir-commits] [mlir] [MLIR][Affine] Fix sibling fusion - missing check (PR #126626)
Uday Bondhugula
llvmlistbot at llvm.org
Wed Feb 12 15:57:14 PST 2025
https://github.com/bondhugula updated https://github.com/llvm/llvm-project/pull/126626
>From 7a03b9fa54d084a4d7cf3142c200357066ca2671 Mon Sep 17 00:00:00 2001
From: Uday Bondhugula <uday at polymagelabs.com>
Date: Tue, 11 Feb 2025 09:01:16 +0530
Subject: [PATCH] [MLIR][Affine] Fix sibling fusion - missing check
Fix sibling fusion for slice maximality check. Producer-consumer fusion
had this check but not sibling fusion. Sibling fusion shouldn't be
performed if the slice isn't "maximal" (i.e., if it isn't the whole of
the source).
Fixes: https://github.com/llvm/llvm-project/issues/48703
---
.../Dialect/Affine/Transforms/LoopFusion.cpp | 41 +++++++++++++------
mlir/test/Dialect/Affine/loop-fusion-2.mlir | 2 +
mlir/test/Dialect/Affine/loop-fusion-4.mlir | 23 ++++++++---
mlir/test/Dialect/Affine/loop-fusion.mlir | 3 ++
4 files changed, 51 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index b38dd8effe669..74be68943ee98 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -1162,24 +1162,48 @@ struct GreedyFusion {
}
assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
- assert(!depthSliceUnions[bestDstLoopDepth - 1].isEmpty() &&
+
+ const ComputationSliceState &bestSlice =
+ depthSliceUnions[bestDstLoopDepth - 1];
+ assert(!bestSlice.isEmpty() &&
"Fusion depth has no computed slice union");
+
+ // Do not perform sibling fusion if it isn't maximal. We always remove the
+ // sibling node and as such fusion shouldn't be performed if a part of the
+ // slice is used in the destination.
+ auto isMaximal = bestSlice.isMaximal();
+ if (!isMaximal.value_or(false)) {
+ LLVM_DEBUG(llvm::dbgs()
+ << "Slice isn't maximal; not performing sibling fusion.\n");
+ continue;
+ }
+
// Check if source loop is being inserted in the innermost
// destination loop. Based on this, the fused loop may be optimized
// further inside `fuseLoops`.
bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest);
// Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'.
- affine::fuseLoops(sibAffineForOp, dstAffineForOp,
- depthSliceUnions[bestDstLoopDepth - 1],
+ affine::fuseLoops(sibAffineForOp, dstAffineForOp, bestSlice,
isInnermostInsertion);
auto dstForInst = cast<AffineForOp>(dstNode->op);
// Update operation position of fused loop nest (if needed).
- if (insertPointInst != dstForInst) {
+ if (insertPointInst != dstForInst)
dstForInst->moveBefore(insertPointInst);
- }
+
+ LLVM_DEBUG(llvm::dbgs()
+ << "Fused sibling nest " << sibId << " into destination nest "
+ << dstNode->id << " at depth " << bestDstLoopDepth << ":\n"
+ << dstAffineForOp << "\n");
+
// Update data dependence graph state post fusion.
updateStateAfterSiblingFusion(sibNode, dstNode);
+
+ // Remove old sibling loop nest.
+ // Get op before we invalidate the MDG node.
+ Operation *op = sibNode->op;
+ mdg->removeNode(sibNode->id);
+ op->erase();
}
}
@@ -1321,13 +1345,6 @@ struct GreedyFusion {
mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts,
dstLoopCollector.storeOpInsts, dstLoopCollector.memrefLoads,
dstLoopCollector.memrefStores, dstLoopCollector.memrefFrees);
- // Remove old sibling loop nest if it no longer has outgoing dependence
- // edges, and it does not write to a memref which escapes the block.
- if (mdg->getOutEdgeCount(sibNode->id) == 0) {
- Operation *op = sibNode->op;
- mdg->removeNode(sibNode->id);
- op->erase();
- }
}
// Clean up any allocs with no users.
diff --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
index 8fec24f71b14a..99207e4910462 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-2.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
@@ -389,6 +389,8 @@ func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10
// -----
+// Test sibling fusion of two matrix-vector products sharing the input matrix.
+
func.func @two_matrix_vector_products() {
%in_matrix = memref.alloc() : memref<10x10xf32>
%in_vec0 = memref.alloc() : memref<10xf32>
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index 2830235431c76..813b93a248e4a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -1,5 +1,7 @@
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
+// All fusion: producer-consumer and sibling.
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV
// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
@@ -108,6 +110,7 @@ func.func @check_src_dst_step(%m : memref<100xf32>,
func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant 1.000000e+00 : f32
+ // This nest writes to %arg1 but can be eliminated post sibling fusion.
affine.for %arg3 = 0 to 1 {
affine.for %arg4 = 0 to 64 {
%accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
@@ -137,11 +140,11 @@ func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : m
// since the destination loop and source loop trip counts do not
// match.
// SIBLING-MAXIMAL: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
-// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 {
-// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 {
-// SIBLING-MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
-// SIBLING-MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 1 {
+// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 {
+// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) {
// -----
@@ -315,11 +318,16 @@ func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref<
return
}
+// -----
+
// PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores
+// ALL-LABEL: func @same_memref_load_multiple_stores
func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){
%cst = arith.constant 2.000000e+00 : f32
- // Source isn't removed.
+ // Ensure that source isn't removed during both producer-consumer fusion and
+ // sibling fusion.
// PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32
+ // ALL: affine.for %{{.*}} = 0 to 32
affine.for %arg3 = 0 to 32 {
%0 = affine.load %producer[%arg3] : memref<32xf32>
%2 = arith.mulf %0, %cst : f32
@@ -343,5 +351,8 @@ func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %produce
// PRODUCER-CONSUMER-NEXT: arith.addf
// PRODUCER-CONSUMER-NEXT: affine.store
// PRODUCER-CONSUMER-NEXT: }
+ // ALL: affine.for %{{.*}} = 0 to 16
+ // ALL: mulf
+ // ALL: addf
return
}
diff --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir
index 1c119e87c5336..dcd2e1cdb275a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion.mlir
@@ -1206,6 +1206,9 @@ func.func @should_fuse_with_private_memref() {
// CHECK: affine.for %{{.*}} = 0 to 17 {
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
+ // CHECK-NEXT: }
+ // CHECK: affine.for %{{.*}} = 0 to 82 {
+ // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
More information about the Mlir-commits
mailing list