[Mlir-commits] [mlir] b2af2ae - [mlir] Mode for explicitly controlling the fusion kind
Sumesh Udayakumaran
llvmlistbot at llvm.org
Mon Sep 27 10:48:35 PDT 2021
Author: Sumesh Udayakumaran
Date: 2021-09-27T20:37:42+03:00
New Revision: b2af2aeea679b995b7d430567b3dc9fde6cfd3e0
URL: https://github.com/llvm/llvm-project/commit/b2af2aeea679b995b7d430567b3dc9fde6cfd3e0
DIFF: https://github.com/llvm/llvm-project/commit/b2af2aeea679b995b7d430567b3dc9fde6cfd3e0.diff
LOG: [mlir] Mode for explicitly controlling the fusion kind
Adds a new `mode` option that selects which kind of fusion to run: the default greedy mode, which performs both producer-consumer and sibling fusion as is done today, or only producer-consumer fusion, or only sibling fusion. This also helps minimize the compile time of the fusion tests.
Reviewed By: bondhugula, dcaballe
Differential Revision: https://reviews.llvm.org/D110102
Added:
Modified:
mlir/include/mlir/Transforms/Passes.h
mlir/include/mlir/Transforms/Passes.td
mlir/lib/Transforms/LoopFusion.cpp
mlir/lib/Transforms/PassDetail.h
mlir/test/Transforms/loop-fusion-4.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index eef88b5d7a48c..3a5b9bcecbdab 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -25,6 +25,10 @@ namespace mlir {
class AffineForOp;
class GreedyRewriteConfig;
+/// Fusion mode to attempt. The default mode `Greedy` does both
+/// producer-consumer and sibling fusion.
+enum FusionMode { Greedy, ProducerConsumer, Sibling };
+
//===----------------------------------------------------------------------===//
// Passes
//===----------------------------------------------------------------------===//
@@ -72,13 +76,14 @@ createCanonicalizerPass(const GreedyRewriteConfig &config);
/// Creates a pass to perform common sub expression elimination.
std::unique_ptr<Pass> createCSEPass();
-/// Creates a loop fusion pass which fuses loops. Buffers of size less than or
-/// equal to `localBufSizeThreshold` are promoted to memory space
-/// `fastMemorySpace'.
+/// Creates a loop fusion pass which fuses loops according to type of fusion
+/// specified in `fusionMode`. Buffers of size less than or equal to
+/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
std::unique_ptr<OperationPass<FuncOp>>
createLoopFusionPass(unsigned fastMemorySpace = 0,
uint64_t localBufSizeThreshold = 0,
- bool maximalFusion = false);
+ bool maximalFusion = false,
+ enum FusionMode fusionMode = FusionMode::Greedy);
/// Creates a loop invariant code motion pass that hoists loop invariant
/// instructions out of the loop.
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 91af2a2c56a93..360b98d87c156 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -136,7 +136,15 @@ def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> {
"to fast memory space">,
Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
"Enables maximal loop fusion">,
- ];
+ Option<"affineFusionMode", "mode", "enum FusionMode",
+ "mlir::FusionMode::Greedy", "fusion mode to attempt",
+ "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
+ " \"greedy\", \"Perform greedy (both producer-consumer and sibling) fusion\"), "
+ "clEnumValN( mlir::FusionMode::ProducerConsumer, "
+ "\"producer\", \"Perform only producer-consumer fusion\"), "
+ "clEnumValN( mlir::FusionMode::Sibling, "
+ "\"sibling\", \"Perform only sibling fusion\"))">,
+ ];
let dependentDialects = ["memref::MemRefDialect"];
}
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp
index c19c887a593d6..6a456ea84b350 100644
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@@ -49,10 +49,11 @@ namespace {
struct LoopFusion : public AffineLoopFusionBase<LoopFusion> {
LoopFusion() = default;
LoopFusion(unsigned fastMemorySpace, uint64_t localBufSizeThresholdBytes,
- bool maximalFusion) {
+ bool maximalFusion, enum FusionMode affineFusionMode) {
this->fastMemorySpace = fastMemorySpace;
this->localBufSizeThreshold = localBufSizeThresholdBytes / 1024;
this->maximalFusion = maximalFusion;
+ this->affineFusionMode = affineFusionMode;
}
void runOnFunction() override;
@@ -62,9 +63,10 @@ struct LoopFusion : public AffineLoopFusionBase<LoopFusion> {
std::unique_ptr<OperationPass<FuncOp>>
mlir::createLoopFusionPass(unsigned fastMemorySpace,
- uint64_t localBufSizeThreshold, bool maximalFusion) {
+ uint64_t localBufSizeThreshold, bool maximalFusion,
+ enum FusionMode affineFusionMode) {
return std::make_unique<LoopFusion>(fastMemorySpace, localBufSizeThreshold,
- maximalFusion);
+ maximalFusion, affineFusionMode);
}
namespace {
@@ -1391,13 +1393,25 @@ struct GreedyFusion {
worklist.push_back(node.id);
}
}
+ /// Run only sibling fusion on the `mdg`.
+ void runSiblingFusionOnly() {
+ fuseSiblingNodes();
+ eraseUnusedMemRefAllocations();
+ }
+
+ /// Run only producer/consumer fusion on the `mdg`.
+ void runProducerConsumerFusionOnly() {
+ fuseProducerConsumerNodes(
+ /*maxSrcUserCount=*/std::numeric_limits<unsigned>::max());
+ eraseUnusedMemRefAllocations();
+ }
// Run the GreedyFusion pass.
// *) First pass through the nodes fuses single-use producer nodes into their
// unique consumer.
// *) Second pass fuses sibling nodes which share no dependence edges.
// *) Third pass fuses any remaining producer nodes into their users.
- void run() {
+ void runGreedyFusion() {
// TODO: Run this repeatedly until a fixed-point is reached.
fuseProducerConsumerNodes(/*maxSrcUserCount=*/1);
fuseSiblingNodes();
@@ -1971,5 +1985,11 @@ void LoopFusion::runOnFunction() {
unsigned localBufSizeThresholdBytes = localBufSizeThreshold * 1024;
GreedyFusion fusion(&g, localBufSizeThresholdBytes, fastMemorySpaceOpt,
maximalFusion, computeToleranceThreshold);
- fusion.run();
+
+ if (affineFusionMode == FusionMode::ProducerConsumer)
+ fusion.runProducerConsumerFusionOnly();
+ else if (affineFusionMode == FusionMode::Sibling)
+ fusion.runSiblingFusionOnly();
+ else
+ fusion.runGreedyFusion();
}
diff --git a/mlir/lib/Transforms/PassDetail.h b/mlir/lib/Transforms/PassDetail.h
index 2cb0e12b1cf20..c6a67e32511bc 100644
--- a/mlir/lib/Transforms/PassDetail.h
+++ b/mlir/lib/Transforms/PassDetail.h
@@ -10,6 +10,7 @@
#define TRANSFORMS_PASSDETAIL_H_
#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
namespace mlir {
class AffineDialect;
diff --git a/mlir/test/Transforms/loop-fusion-4.mlir b/mlir/test/Transforms/loop-fusion-4.mlir
index 61fd4e3c777de..15b345e3780b5 100644
--- a/mlir/test/Transforms/loop-fusion-4.mlir
+++ b/mlir/test/Transforms/loop-fusion-4.mlir
@@ -1,54 +1,13 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -split-input-file | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s --check-prefix=MAXIMAL
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="mode=producer" -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal mode=sibling" -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
-// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
+// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir
-// -----
-
-func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
- %cst_0 = constant 0.000000e+00 : f32
- %cst_1 = constant 1.000000e+00 : f32
- affine.for %arg3 = 0 to 1 {
- affine.for %arg4 = 0 to 64 {
- %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
- %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
- %5 = addf %prevAccum, %4 : f32
- affine.yield %5 : f32
- }
- %accum_dbl = addf %accum, %accum : f32
- affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
- }
- }
- affine.for %arg3 = 0 to 1 {
- affine.for %arg4 = 0 to 64 {
- // Following loop trip count does not match the corresponding source trip count.
- %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
- %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
- %5 = mulf %prevAccum, %4 : f32
- affine.yield %5 : f32
- }
- %accum_sqr = mulf %accum, %accum : f32
- affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
- }
- }
- return
-}
-// Test checks the loop structure is preserved after sibling fusion
-// since the destination loop and source loop trip counts do not
-// match.
-// MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32(
-// MAXIMAL: %[[cst_0:.*]] = constant 0.000000e+00 : f32
-// MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32
-// MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 {
-// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 {
-// MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
-// MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
-
// Expects fusion of producer into consumer at depth 4 and subsequent removal of
// source loop.
-// CHECK-LABEL: func @unflatten4d
+// PRODUCER-CONSUMER-LABEL: func @unflatten4d
func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
%m = memref.alloc() : memref<5040xf32>
%cf7 = constant 7.0 : f32
@@ -75,18 +34,18 @@ func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
return
}
-// CHECK: affine.for
-// CHECK-NEXT: affine.for
-// CHECK-NEXT: affine.for
-// CHECK-NEXT: affine.for
-// CHECK-NOT: affine.for
-// CHECK: return
+// PRODUCER-CONSUMER: affine.for
+// PRODUCER-CONSUMER-NEXT: affine.for
+// PRODUCER-CONSUMER-NEXT: affine.for
+// PRODUCER-CONSUMER-NEXT: affine.for
+// PRODUCER-CONSUMER-NOT: affine.for
+// PRODUCER-CONSUMER: return
// -----
// Expects fusion of producer into consumer at depth 2 and subsequent removal of
// source loop.
-// CHECK-LABEL: func @unflatten2d_with_transpose
+// PRODUCER-CONSUMER-LABEL: func @unflatten2d_with_transpose
func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
%m = memref.alloc() : memref<56xf32>
%cf7 = constant 7.0 : f32
@@ -105,7 +64,48 @@ func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
return
}
-// CHECK: affine.for
-// CHECK-NEXT: affine.for
-// CHECK-NOT: affine.for
-// CHECK: return
\ No newline at end of file
+// PRODUCER-CONSUMER: affine.for
+// PRODUCER-CONSUMER-NEXT: affine.for
+// PRODUCER-CONSUMER-NOT: affine.for
+// PRODUCER-CONSUMER: return
+
+// -----
+
+// SIBLING-MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32(
+func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
+ %cst_0 = constant 0.000000e+00 : f32
+ %cst_1 = constant 1.000000e+00 : f32
+ affine.for %arg3 = 0 to 1 {
+ affine.for %arg4 = 0 to 64 {
+ %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
+ %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
+ %5 = addf %prevAccum, %4 : f32
+ affine.yield %5 : f32
+ }
+ %accum_dbl = addf %accum, %accum : f32
+ affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
+ }
+ }
+ affine.for %arg3 = 0 to 1 {
+ affine.for %arg4 = 0 to 64 {
+ // Following loop trip count does not match the corresponding source trip count.
+ %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
+ %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
+ %5 = mulf %prevAccum, %4 : f32
+ affine.yield %5 : f32
+ }
+ %accum_sqr = mulf %accum, %accum : f32
+ affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
+ }
+ }
+ return
+}
+// Test checks the loop structure is preserved after sibling fusion
+// since the destination loop and source loop trip counts do not
+// match.
+// SIBLING-MAXIMAL: %[[cst_0:.*]] = constant 0.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 {
+// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 {
+// SIBLING-MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
\ No newline at end of file
More information about the Mlir-commits mailing list