[Mlir-commits] [mlir] [mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. (PR #126475)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sun Feb 9 23:09:54 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-affine
Author: lonely eagle (linuxlonelyeagle)
<details>
<summary>Changes</summary>
Make `affine-loop-unroll` a `FunctionOpInterface` pass. Now unrolling can also be done on `gpu.func`.
---
Full diff: https://github.com/llvm/llvm-project/pull/126475.diff
4 Files Affected:
- (modified) mlir/include/mlir/Dialect/Affine/Passes.h (+2-1)
- (modified) mlir/include/mlir/Dialect/Affine/Passes.td (+1-1)
- (modified) mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp (+40-35)
- (modified) mlir/test/Dialect/Affine/unroll.mlir (+100)
``````````diff
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
index bc29d04287ac462..37147b079e5d992 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -19,6 +19,7 @@
namespace mlir {
+class ModuleOp;
namespace func {
class FuncOp;
} // namespace func
@@ -93,7 +94,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLoopTilingPass();
/// factors supplied through other means. If -1 is passed as the unrollFactor
/// and no callback is provided, anything passed from the command-line (if at
/// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor).
-std::unique_ptr<OperationPass<func::FuncOp>> createLoopUnrollPass(
+std::unique_ptr<OperationPass<mlir::ModuleOp>> createLoopUnrollPass(
int unrollFactor = -1, bool unrollUpToFactor = false,
bool unrollFull = false,
const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr);
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index d7c7897c6573016..d96b50c3e81043c 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> {
];
}
-def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> {
+def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> {
let summary = "Unroll affine loops";
let constructor = "mlir::affine::createLoopUnrollPass()";
let options = [
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
index 57df7ada91654c0..4dc9809574115eb 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
@@ -19,6 +19,7 @@
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -82,7 +83,7 @@ static bool isInnermostAffineForOp(AffineForOp op) {
}
/// Gathers loops that have no affine.for's nested within.
-static void gatherInnermostLoops(func::FuncOp f,
+static void gatherInnermostLoops(FunctionOpInterface f,
SmallVectorImpl<AffineForOp> &loops) {
f.walk([&](AffineForOp forOp) {
if (isInnermostAffineForOp(forOp))
@@ -91,40 +92,44 @@ static void gatherInnermostLoops(func::FuncOp f,
}
void LoopUnroll::runOnOperation() {
- func::FuncOp func = getOperation();
- if (func.isExternal())
- return;
-
- if (unrollFull && unrollFullThreshold.hasValue()) {
- // Store short loops as we walk.
+ mlir::ModuleOp module = getOperation();
+ SmallVector<FunctionOpInterface> funcOps;
+ module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); });
+ for (auto func : funcOps) {
+ if (func.isExternal())
+ return;
+
+ if (unrollFull && unrollFullThreshold.hasValue()) {
+ // Store short loops as we walk.
+ SmallVector<AffineForOp, 4> loops;
+
+ // Gathers all loops with trip count <= minTripCount. Do a post order walk
+ // so that loops are gathered from innermost to outermost (or else
+ // unrolling an outer one may delete gathered inner ones).
+ getOperation().walk([&](AffineForOp forOp) {
+ std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+ if (tripCount && *tripCount <= unrollFullThreshold)
+ loops.push_back(forOp);
+ });
+ for (auto forOp : loops)
+ (void)loopUnrollFull(forOp);
+ return;
+ }
+
+ // If the call back is provided, we will recurse until no loops are found.
SmallVector<AffineForOp, 4> loops;
-
- // Gathers all loops with trip count <= minTripCount. Do a post order walk
- // so that loops are gathered from innermost to outermost (or else unrolling
- // an outer one may delete gathered inner ones).
- getOperation().walk([&](AffineForOp forOp) {
- std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
- if (tripCount && *tripCount <= unrollFullThreshold)
- loops.push_back(forOp);
- });
- for (auto forOp : loops)
- (void)loopUnrollFull(forOp);
- return;
- }
-
- // If the call back is provided, we will recurse until no loops are found.
- SmallVector<AffineForOp, 4> loops;
- for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
- loops.clear();
- gatherInnermostLoops(func, loops);
- if (loops.empty())
- break;
- bool unrolled = false;
- for (auto forOp : loops)
- unrolled |= succeeded(runOnAffineForOp(forOp));
- if (!unrolled)
- // Break out if nothing was unrolled.
- break;
+ for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
+ loops.clear();
+ gatherInnermostLoops(func, loops);
+ if (loops.empty())
+ break;
+ bool unrolled = false;
+ for (auto forOp : loops)
+ unrolled |= succeeded(runOnAffineForOp(forOp));
+ if (!unrolled)
+ // Break out if nothing was unrolled.
+ break;
+ }
}
}
@@ -145,7 +150,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
cleanUpUnroll);
}
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::affine::createLoopUnrollPass(
+std::unique_ptr<OperationPass<ModuleOp>> mlir::affine::createLoopUnrollPass(
int unrollFactor, bool unrollUpToFactor, bool unrollFull,
const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
return std::make_unique<LoopUnroll>(
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index e398c3fe2011dd8..43485ca56deeba5 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -240,6 +240,23 @@ func.func @loop_nest_unroll_full() {
return
} // UNROLL-FULL }
+gpu.module @unroll_full {
+ // UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() {
+ gpu.func @gpu_loop_nest_simplest() {
+ // UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 {
+ affine.for %i = 0 to 100 step 2 {
+ // UNROLL-FULL: %c1_i32 = arith.constant 1 : i32
+ // UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32
+ // UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32
+ // UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32
+ affine.for %j = 0 to 4 {
+ %x = arith.constant 1 : i32
+ }
+ } // UNROLL-FULL: }
+ gpu.return // UNROLL-FULL: return
+ }
+}
+
// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
@@ -260,6 +277,28 @@ func.func @loop_nest_outer_unroll() {
return // SHORT: return
} // SHORT }
+gpu.module @short {
+ // SHORT-LABEL: func @gpu_loop_nest_outer_unroll() {
+ gpu.func @gpu_loop_nest_outer_unroll() {
+ // SHORT: affine.for %arg0 = 0 to 4 {
+ // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0)
+ // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
+ // SHORT-NEXT: }
+ // SHORT-NEXT: affine.for %arg0 = 0 to 4 {
+ // SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0)
+ // SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
+ // SHORT-NEXT: }
+ affine.for %i = 0 to 2 {
+ affine.for %j = 0 to 4 {
+ %x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } :
+ (index) -> (index)
+ %y = "addi32"(%x, %x) : (index, index) -> index
+ }
+ }
+ gpu.return // SHORT: gpu.return
+ } // SHORT }
+}
+
// We are doing a minimal FileCheck here. We just need this test case to
// successfully run. Both %x and %y will get unrolled here as the min trip
// count threshold set to 2.
@@ -345,6 +384,37 @@ func.func @unroll_unit_stride_no_cleanup() {
return
}
+gpu.module @unroll_by_4{
+ // UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() {
+ gpu.func @gpu_unroll_unit_stride_no_cleanup() {
+ // UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
+ affine.for %i = 0 to 100 {
+ // UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 {
+ // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
+ // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
+ // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]])
+ // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
+ // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
+ // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]])
+ // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
+ // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
+ // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]])
+ // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
+ // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
+ // UNROLL-BY-4-NEXT: }
+ affine.for %j = 0 to 8 {
+ %x = "addi32"(%j, %j) : (index, index) -> i32
+ %y = "addi32"(%x, %x) : (i32, i32) -> i32
+ }
+ // empty loop
+ // UNROLL-BY-4: affine.for %arg1 = 0 to 8 {
+ affine.for %k = 0 to 8 {
+ }
+ }
+ gpu.return
+ }
+}
+
// UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() {
func.func @unroll_unit_stride_cleanup() {
// UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
@@ -632,6 +702,19 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() {
// UNROLL-BY-1-NEXT: return
}
+gpu.module @unroll_by_1 {
+ // UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop()
+ gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() {
+ affine.for %i = 0 to 1 {
+ %x = "foo"(%i) : (index) -> i32
+ }
+ gpu.return
+ // UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index
+ // UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32
+ // UNROLL-BY-1-NEXT: gpu.return
+ }
+}
+
// Test unrolling with affine.for iter_args.
// UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup
@@ -706,6 +789,23 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() {
// UNROLL-CLEANUP-LOOP-NEXT: return
}
+gpu.module @unroll_cleanup_loop {
+ // UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor()
+ gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() {
+ affine.for %i = 0 to 3 {
+ %x = "foo"(%i) : (index) -> i32
+ }
+ gpu.return
+ // UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index
+ // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32
+ // UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}}
+ // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32
+ // UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}}
+ // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32
+ // UNROLL-CLEANUP-LOOP-NEXT: gpu.return
+ }
+}
+
// UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor()
func.func @unroll_cleanup_loop_with_smaller_unroll_factor() {
affine.for %i = 0 to 7 {
``````````
</details>
https://github.com/llvm/llvm-project/pull/126475
More information about the Mlir-commits
mailing list