[Mlir-commits] [mlir] cd73081 - [mlir] parallel loop tiling optimization for loops with static bounds
Tobias Gysi
llvmlistbot at llvm.org
Thu Jun 25 00:21:31 PDT 2020
Author: Tobias Gysi
Date: 2020-06-25T09:21:24+02:00
New Revision: cd730816058b4bed2623e2a9b505475525b74144
URL: https://github.com/llvm/llvm-project/commit/cd730816058b4bed2623e2a9b505475525b74144
DIFF: https://github.com/llvm/llvm-project/commit/cd730816058b4bed2623e2a9b505475525b74144.diff
LOG: [mlir] parallel loop tiling optimization for loops with static bounds
Summary: The patch optimizes the tiling of parallel loops with static bounds: if the number of loop iterations is an integer multiple of the tile size, the inner loop gets a static upper bound and no affine.min computation is emitted.
Reviewers: herhut, ftynse, bondhugula
Reviewed By: herhut, ftynse
Subscribers: bondhugula, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, msifontes
Tags: #mlir
Differential Revision: https://reviews.llvm.org/D82003
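
To make the condition concrete: for the @static_loop_with_step test added below, the trip counts are ceil((22 - 0) / 3) = 8 and ceil((24 - 0) / 3) = 8, and with the tile sizes of 1 and 4 used by the test both dimensions satisfy "trip count % tile size == 0", so both inner loop bounds become static. A minimal standalone C++ sketch of that check (illustrative only; the helper names are invented here and are not part of the patch, which performs the check with llvm::divideCeil inside tileParallelLoop):

#include <cstdint>
#include <iostream>

// Ceiling division for positive operands, mirroring llvm::divideCeil.
static int64_t divideCeil(int64_t numerator, int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}

// True if the tiled inner loop can use a static upper bound, i.e. the
// trip count of the original loop dimension is a multiple of the tile size.
static bool innerBoundIsStatic(int64_t lowerBound, int64_t upperBound,
                               int64_t step, int64_t tileSize) {
  int64_t numIterations = divideCeil(upperBound - lowerBound, step);
  return numIterations % tileSize == 0;
}

int main() {
  // The two dimensions of the @static_loop_with_step test below:
  // (0..22 step 3, tile 1) and (0..24 step 3, tile 4); both trip counts are 8.
  std::cout << innerBoundIsStatic(0, 22, 3, 1) << "\n"; // 1 (8 % 1 == 0)
  std::cout << innerBoundIsStatic(0, 24, 3, 4) << "\n"; // 1 (8 % 4 == 0)
  std::cout << innerBoundIsStatic(0, 22, 3, 3) << "\n"; // 0 (8 % 3 != 0)
  return 0;
}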
Added:
Modified:
mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
index ee4428ded99e..7bcc989a5b28 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp
@@ -16,8 +16,6 @@
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Transforms/RegionUtils.h"
-#include "llvm/Support/CommandLine.h"
using namespace mlir;
using namespace mlir::scf;
@@ -30,8 +28,8 @@ using namespace mlir::scf;
/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
/// step (%arg4*tileSize[0],
/// %arg5*tileSize[1])
-/// scf.parallel (%j0, %j1) = (0, 0) to (min(tileSize[0], %arg2-%i0)
-/// min(tileSize[1], %arg3-%i1))
+/// scf.parallel (%j0, %j1) = (0, 0) to (min(%arg4*tileSize[0], %arg2-%i0)
+/// min(%arg5*tileSize[1], %arg3-%i1))
/// step (%arg4, %arg5)
///
/// where the uses of %i0 and %i1 in the loop body are replaced by
@@ -76,12 +74,36 @@ void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
// Create the inner loop with adjusted bounds.
SmallVector<Value, 2> newBounds;
newBounds.reserve(op.upperBound().size());
- for (auto bounds : llvm::zip(tileSizeConstants, outerLoop.upperBound(),
- outerLoop.getInductionVars())) {
- newBounds.push_back(b.create<AffineMinOp>(
- op.getLoc(), b.getIndexType(), minMap,
- ValueRange{std::get<0>(bounds), std::get<1>(bounds),
- std::get<2>(bounds)}));
+ for (auto dim : llvm::zip(outerLoop.lowerBound(), outerLoop.upperBound(),
+ outerLoop.step(), outerLoop.getInductionVars(),
+ op.step(), tileSizeConstants)) {
+ Value lowerBound, upperBound, newStep, iv, step, tileSizeConstant;
+ std::tie(lowerBound, upperBound, newStep, iv, step, tileSizeConstant) = dim;
+ // Collect the statically known loop bounds
+ auto lowerBoundConstant =
+ dyn_cast_or_null<ConstantIndexOp>(lowerBound.getDefiningOp());
+ auto upperBoundConstant =
+ dyn_cast_or_null<ConstantIndexOp>(upperBound.getDefiningOp());
+ auto stepConstant = dyn_cast_or_null<ConstantIndexOp>(step.getDefiningOp());
+ auto tileSize =
+ cast<ConstantIndexOp>(tileSizeConstant.getDefiningOp()).getValue();
+ // If the loop bounds and the loop step are constant and if the number of
+ // loop iterations is an integer multiple of the tile size, we use a static
+ // bound for the inner loop.
+ if (lowerBoundConstant && upperBoundConstant && stepConstant) {
+ auto numIterations = llvm::divideCeil(upperBoundConstant.getValue() -
+ lowerBoundConstant.getValue(),
+ stepConstant.getValue());
+ if (numIterations % tileSize == 0) {
+ newBounds.push_back(newStep);
+ continue;
+ }
+ }
+ // Otherwise, we dynamically compute the bound for
+ // each iteration of the outer loop.
+ newBounds.push_back(
+ b.create<AffineMinOp>(op.getLoc(), b.getIndexType(), minMap,
+ ValueRange{newStep, upperBound, iv}));
}
auto innerLoop = b.create<ParallelOp>(
op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
@@ -104,8 +126,8 @@ void mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes) {
op.erase();
}
-/// Get a list of most nested parallel loops. Assumes that ParallelOps are only
-/// directly nested.
+/// Get a list of most nested parallel loops. Assumes that ParallelOps are
+/// only directly nested.
static bool getInnermostNestedLoops(Block *block,
SmallVectorImpl<ParallelOp> &loops) {
bool hasInnerLoop = false;
@@ -131,7 +153,9 @@ struct ParallelLoopTiling
getInnermostNestedLoops(&block, mostNestedParallelOps);
}
for (ParallelOp pLoop : mostNestedParallelOps) {
- tileParallelLoop(pLoop, tileSizes);
+ // FIXME: Add reduction support.
+ if (pLoop.getNumReductions() == 0)
+ tileParallelLoop(pLoop, tileSizes);
}
}
};
diff --git a/mlir/test/Dialect/SCF/parallel-loop-tiling.mlir b/mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
index f12416266ed9..6dff4eeda9e7 100644
--- a/mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
+++ b/mlir/test/Dialect/SCF/parallel-loop-tiling.mlir
@@ -15,22 +15,52 @@ func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK: #map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
// CHECK-LABEL: func @parallel_loop(
-// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: index, [[VAL_6:%.*]]: memref<?x?xf32>, [[VAL_7:%.*]]: memref<?x?xf32>, [[VAL_8:%.*]]: memref<?x?xf32>, [[VAL_9:%.*]]: memref<?x?xf32>) {
-// CHECK: [[VAL_10:%.*]] = constant 0 : index
-// CHECK: [[VAL_11:%.*]] = constant 1 : index
-// CHECK: [[VAL_12:%.*]] = constant 4 : index
-// CHECK: [[VAL_13:%.*]] = muli [[VAL_4]], [[VAL_11]] : index
-// CHECK: [[VAL_14:%.*]] = muli [[VAL_5]], [[VAL_12]] : index
-// CHECK: scf.parallel ([[VAL_15:%.*]], [[VAL_16:%.*]]) = ([[VAL_0]], [[VAL_1]]) to ([[VAL_2]], [[VAL_3]]) step ([[VAL_13]], [[VAL_14]]) {
-// CHECK: [[VAL_17:%.*]] = affine.min #map0([[VAL_11]], [[VAL_2]], [[VAL_15]])
-// CHECK: [[VAL_18:%.*]] = affine.min #map0([[VAL_12]], [[VAL_3]], [[VAL_16]])
-// CHECK: scf.parallel ([[VAL_19:%.*]], [[VAL_20:%.*]]) = ([[VAL_10]], [[VAL_10]]) to ([[VAL_17]], [[VAL_18]]) step ([[VAL_4]], [[VAL_5]]) {
-// CHECK: [[VAL_21:%.*]] = addi [[VAL_19]], [[VAL_15]] : index
-// CHECK: [[VAL_22:%.*]] = addi [[VAL_20]], [[VAL_16]] : index
-// CHECK: [[VAL_23:%.*]] = load [[VAL_7]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
-// CHECK: [[VAL_24:%.*]] = load [[VAL_8]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
-// CHECK: [[VAL_25:%.*]] = addf [[VAL_23]], [[VAL_24]] : f32
-// CHECK: store [[VAL_25]], [[VAL_9]]{{\[}}[[VAL_21]], [[VAL_22]]] : memref<?x?xf32>
+// CHECK-SAME: [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index, [[ARG5:%.*]]: index, [[ARG6:%.*]]: index, [[ARG7:%.*]]: memref<?x?xf32>, [[ARG8:%.*]]: memref<?x?xf32>, [[ARG9:%.*]]: memref<?x?xf32>, [[ARG10:%.*]]: memref<?x?xf32>) {
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: [[C4:%.*]] = constant 4 : index
+// CHECK: [[V1:%.*]] = muli [[ARG5]], [[C1]] : index
+// CHECK: [[V2:%.*]] = muli [[ARG6]], [[C4]] : index
+// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[ARG1]], [[ARG2]]) to ([[ARG3]], [[ARG4]]) step ([[V1]], [[V2]]) {
+// CHECK: [[V5:%.*]] = affine.min #map0([[V1]], [[ARG3]], [[V3]])
+// CHECK: [[V6:%.*]] = affine.min #map0([[V2]], [[ARG4]], [[V4]])
+// CHECK: scf.parallel ([[V7:%.*]], [[V8:%.*]]) = ([[C0]], [[C0]]) to ([[V5]], [[V6]]) step ([[ARG5]], [[ARG6]]) {
+// CHECK: [[V9:%.*]] = addi [[V7]], [[V3]] : index
+// CHECK: [[V10:%.*]] = addi [[V8]], [[V4]] : index
+// CHECK: [[V11:%.*]] = load [[ARG8]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
+// CHECK: [[V12:%.*]] = load [[ARG9]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
+// CHECK: [[V13:%.*]] = addf [[V11]], [[V12]] : f32
+// CHECK: store [[V13]], [[ARG10]]{{\[}}[[V9]], [[V10]]] : memref<?x?xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: return
+
+// -----
+
+func @static_loop_with_step() {
+ %c0 = constant 0 : index
+ %c3 = constant 3 : index
+ %c22 = constant 22 : index
+ %c24 = constant 24 : index
+ scf.parallel (%i0, %i1) = (%c0, %c0) to (%c22, %c24) step (%c3, %c3) {
+ }
+ return
+}
+
+// CHECK-LABEL: func @static_loop_with_step() {
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C3:%.*]] = constant 3 : index
+// CHECK: [[C22:%.*]] = constant 22 : index
+// CHECK: [[C24:%.*]] = constant 24 : index
+// CHECK: [[C0_1:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: [[C4:%.*]] = constant 4 : index
+// CHECK: [[V1:%.*]] = muli [[C3]], [[C1]] : index
+// CHECK: [[V2:%.*]] = muli [[C3]], [[C4]] : index
+// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[C0]], [[C0]]) to ([[C22]], [[C24]]) step ([[V1]], [[V2]]) {
+// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V1]], [[V2]]) step ([[C3]], [[C3]]) {
+// CHECK: = addi [[V5]], [[V3]] : index
+// CHECK: = addi [[V6]], [[V4]] : index
// CHECK: }
// CHECK: }
// CHECK: return
@@ -51,31 +81,33 @@ func @tile_nested_innermost() {
}
// CHECK-LABEL: func @tile_nested_innermost() {
-// CHECK: [[VAL_24:%.*]] = constant 2 : index
-// CHECK: [[VAL_25:%.*]] = constant 0 : index
-// CHECK: [[VAL_26:%.*]] = constant 1 : index
-// CHECK: scf.parallel ([[VAL_27:%.*]], [[VAL_28:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_26]], [[VAL_26]]) {
-// CHECK: [[VAL_29:%.*]] = constant 0 : index
-// CHECK: [[VAL_30:%.*]] = constant 1 : index
-// CHECK: [[VAL_31:%.*]] = constant 4 : index
-// CHECK: [[VAL_32:%.*]] = muli [[VAL_26]], [[VAL_30]] : index
-// CHECK: [[VAL_33:%.*]] = muli [[VAL_26]], [[VAL_31]] : index
-// CHECK: scf.parallel ([[VAL_34:%.*]], [[VAL_35:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_32]], [[VAL_33]]) {
-// CHECK: [[VAL_36:%.*]] = affine.min #map0([[VAL_30]], [[VAL_24]], [[VAL_34]])
-// CHECK: [[VAL_37:%.*]] = affine.min #map0([[VAL_31]], [[VAL_24]], [[VAL_35]])
-// CHECK: scf.parallel ([[VAL_38:%.*]], [[VAL_39:%.*]]) = ([[VAL_29]], [[VAL_29]]) to ([[VAL_36]], [[VAL_37]]) step ([[VAL_26]], [[VAL_26]]) {
+// CHECK: [[C2:%.*]] = constant 2 : index
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: scf.parallel ([[V1:%.*]], [[V2:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) {
+// CHECK: [[C0_1:%.*]] = constant 0 : index
+// CHECK: [[C1_1:%.*]] = constant 1 : index
+// CHECK: [[C4:%.*]] = constant 4 : index
+// CHECK: [[V3:%.*]] = muli [[C1]], [[C1_1]] : index
+// CHECK: [[V4:%.*]] = muli [[C1]], [[C4]] : index
+// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V3]], [[V4]]) {
+// CHECK: [[V7:%.*]] = affine.min #map0([[V4]], [[C2]], [[V6]])
+// CHECK: scf.parallel ([[V8:%.*]], [[V9:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V3]], [[V7]]) step ([[C1]], [[C1]]) {
+// CHECK: = addi [[V8]], [[V5]] : index
+// CHECK: = addi [[V9]], [[V6]] : index
// CHECK: }
// CHECK: }
// CHECK: }
-// CHECK: [[VAL_40:%.*]] = constant 0 : index
-// CHECK: [[VAL_41:%.*]] = constant 1 : index
-// CHECK: [[VAL_42:%.*]] = constant 4 : index
-// CHECK: [[VAL_43:%.*]] = muli [[VAL_26]], [[VAL_41]] : index
-// CHECK: [[VAL_44:%.*]] = muli [[VAL_26]], [[VAL_42]] : index
-// CHECK: scf.parallel ([[VAL_45:%.*]], [[VAL_46:%.*]]) = ([[VAL_25]], [[VAL_25]]) to ([[VAL_24]], [[VAL_24]]) step ([[VAL_43]], [[VAL_44]]) {
-// CHECK: [[VAL_47:%.*]] = affine.min #map0([[VAL_41]], [[VAL_24]], [[VAL_45]])
-// CHECK: [[VAL_48:%.*]] = affine.min #map0([[VAL_42]], [[VAL_24]], [[VAL_46]])
-// CHECK: scf.parallel ([[VAL_49:%.*]], [[VAL_50:%.*]]) = ([[VAL_40]], [[VAL_40]]) to ([[VAL_47]], [[VAL_48]]) step ([[VAL_26]], [[VAL_26]]) {
+// CHECK: [[C0_2:%.*]] = constant 0 : index
+// CHECK: [[C1_2:%.*]] = constant 1 : index
+// CHECK: [[C4_1:%.*]] = constant 4 : index
+// CHECK: [[V10:%.*]] = muli [[C1]], [[C1_2]] : index
+// CHECK: [[V11:%.*]] = muli [[C1]], [[C4_1]] : index
+// CHECK: scf.parallel ([[V12:%.*]], [[V13:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V10]], [[V11]]) {
+// CHECK: [[V14:%.*]] = affine.min #map0([[V11]], [[C2]], [[V13]])
+// CHECK: scf.parallel ([[V15:%.*]], [[V16:%.*]]) = ([[C0_2]], [[C0_2]]) to ([[V10]], [[V14]]) step ([[C1]], [[C1]]) {
+// CHECK: = addi [[V15]], [[V12]] : index
+// CHECK: = addi [[V16]], [[V13]] : index
// CHECK: }
// CHECK: }
// CHECK: return
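
The doc comment updated in ParallelLoopTiling.cpp describes the shape of the tiled loop nest: the outer loop steps by step * tileSize, the inner loop runs from 0 to min(step * tileSize, upperBound - %i0) in steps of the original loop step, and uses of the original induction variables are replaced by %i0 + %j0. As an illustrative sanity check only (plain C++ with positive steps assumed; tiledIndices is a made-up helper, not an MLIR API), the simulation below visits exactly the indices of the original loop. When the trip count is a multiple of the tile size, every inner index already satisfies %j0 < upperBound - %i0, which is why the patch can replace the affine.min with the constant step * tileSize without changing the iteration space.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Simulates one dimension of the tiled iteration space:
//   outer: i0 = lb, lb + step*tileSize, ... while i0 < ub
//   inner: j0 = 0 .. min(step*tileSize, ub - i0), stepping by step
// and records i0 + j0, the value that replaces the original induction variable.
static std::vector<int64_t> tiledIndices(int64_t lb, int64_t ub, int64_t step,
                                         int64_t tileSize) {
  std::vector<int64_t> indices;
  int64_t outerStep = step * tileSize;
  for (int64_t i0 = lb; i0 < ub; i0 += outerStep) {
    int64_t innerBound = std::min(outerStep, ub - i0);
    for (int64_t j0 = 0; j0 < innerBound; j0 += step)
      indices.push_back(i0 + j0);
  }
  return indices;
}

int main() {
  // First dimension of @static_loop_with_step: 0..22 step 3, tile size 1.
  for (int64_t i : tiledIndices(0, 22, 3, 1))
    std::cout << i << " ";                  // 0 3 6 9 12 15 18 21
  std::cout << "\n";
  // Second dimension: 0..24 step 3, tile size 4.
  for (int64_t i : tiledIndices(0, 24, 3, 4))
    std::cout << i << " ";                  // 0 3 6 9 12 15 18 21
  std::cout << "\n";
  return 0;
}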