[Mlir-commits] [mlir] 5abf128 - Add a pass that specializes parallel loops for easier unrolling and vectorization
Benjamin Kramer
llvmlistbot at llvm.org
Fri Feb 28 10:49:29 PST 2020
Author: Benjamin Kramer
Date: 2020-02-28T19:47:23+01:00
New Revision: 5abf128d647df65d65e75fd26edb5268f30f6fe0
URL: https://github.com/llvm/llvm-project/commit/5abf128d647df65d65e75fd26edb5268f30f6fe0
DIFF: https://github.com/llvm/llvm-project/commit/5abf128d647df65d65e75fd26edb5268f30f6fe0.diff
LOG: Add a pass that specializes parallel loops for easier unrolling and vectorization
This matches loops with an affine.min upper bound, limiting the trip
count to a constant, and rewrites them into two loops, one with constant
upper bound and one with variable upper bound. The assumption is that
the constant upper bound loop will be unrolled and vectorized, which is
preferable if this is the hot path.
Differential Revision: https://reviews.llvm.org/D75240
Added:
mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
mlir/test/Dialect/Loops/parallel-loop-specialization.mlir
Modified:
mlir/include/mlir/Dialect/LoopOps/Passes.h
mlir/include/mlir/InitAllPasses.h
mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LoopOps/Passes.h b/mlir/include/mlir/Dialect/LoopOps/Passes.h
index b33b5ae3758f..54c0fb480b41 100644
--- a/mlir/include/mlir/Dialect/LoopOps/Passes.h
+++ b/mlir/include/mlir/Dialect/LoopOps/Passes.h
@@ -23,6 +23,10 @@ class Pass;
/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();
+/// Creates a pass that specializes parallel loops for unrolling and
+/// vectorization.
+std::unique_ptr<Pass> createParallelLoopSpecializationPass();
+
/// Creates a pass which tiles innermost parallel loops.
std::unique_ptr<Pass>
createParallelLoopTilingPass(llvm::ArrayRef<int64_t> tileSize = {});
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index a1edc82f4001..2b21ccb613ac 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -109,6 +109,7 @@ inline void registerAllPasses() {
// LoopOps
createParallelLoopFusionPass();
+ createParallelLoopSpecializationPass();
createParallelLoopTilingPass();
// QuantOps
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
index 9dbd5a381a85..7f5b16f1feea 100644
--- a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_library(MLIRLoopOpsTransforms
ParallelLoopFusion.cpp
+ ParallelLoopSpecialization.cpp
ParallelLoopTiling.cpp
ADDITIONAL_HEADER_DIRS
diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
new file mode 100644
index 000000000000..8cb49f3428d6
--- /dev/null
+++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp
@@ -0,0 +1,76 @@
+//===- ParallelLoopSpecialization.cpp - loop.parallel specialization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Specializes parallel loops for easier unrolling and vectorization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/Dialect/LoopOps/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using loop::ParallelOp;
+
+/// Rewrite a loop whose upper bounds are all defined by an affine.min with a
+/// constant result into 2 loops under a loop.if: one using the constants as
+/// bounds, taken when every bound equals its constant, and the original loop
+/// otherwise. Does nothing unless every upper bound matches that pattern.
+static void specializeLoopForUnrolling(ParallelOp op) {
+ SmallVector<int64_t, 2> constantIndices;
+ constantIndices.reserve(op.upperBound().size());
+ for (auto bound : op.upperBound()) {
+ auto minOp = dyn_cast_or_null<AffineMinOp>(bound.getDefiningOp());
+ if (!minOp)
+ return; // Bound is not produced by an affine.min; leave the loop as is.
+ int64_t minConstant = std::numeric_limits<int64_t>::max();
+ for (auto expr : minOp.map().getResults()) {
+ if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
+ minConstant = std::min(minConstant, constantIndex.getValue());
+ }
+ if (minConstant == std::numeric_limits<int64_t>::max())
+ return; // Sentinel untouched: the map has no constant result.
+ constantIndices.push_back(minConstant);
+ }
+
+ OpBuilder b(op); // Builder anchored at the original loop.
+ BlockAndValueMapping map;
+ Value cond; // Conjunction of "bound == constant" over all dimensions.
+ for (auto bound : llvm::zip(op.upperBound(), constantIndices)) {
+ Value constant = b.create<ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
+ Value cmp = b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq,
+ std::get<0>(bound), constant);
+ cond = cond ? b.create<AndOp>(op.getLoc(), cond, cmp) : cmp;
+ map.map(std::get<0>(bound), constant); // Mapped clone uses the constant.
+ }
+ auto ifOp = b.create<loop::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
+ ifOp.getThenBodyBuilder().clone(*op.getOperation(), map); // Constant bounds.
+ ifOp.getElseBodyBuilder().clone(*op.getOperation()); // Original bounds.
+ op.erase(); // Both branches hold a copy; the original loop is removed.
+}
+
+namespace {
+struct ParallelLoopSpecialization
+ : public FunctionPass<ParallelLoopSpecialization> {
+ void runOnFunction() override { // Visit every loop.parallel in the function.
+ getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
+ }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
+ return std::make_unique<ParallelLoopSpecialization>();
+}
+
+static PassRegistration<ParallelLoopSpecialization>
+ pass("parallel-loop-specialization", // mlir-opt: -parallel-loop-specialization
+ "Specialize parallel loops for vectorization.");
diff --git a/mlir/test/Dialect/Loops/parallel-loop-specialization.mlir b/mlir/test/Dialect/Loops/parallel-loop-specialization.mlir
new file mode 100644
index 000000000000..ab736c985986
--- /dev/null
+++ b/mlir/test/Dialect/Loops/parallel-loop-specialization.mlir
@@ -0,0 +1,46 @@
+// RUN: mlir-opt %s -parallel-loop-specialization -split-input-file | FileCheck %s --dump-input-on-failure
+
+#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
+#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>
+
+func @parallel_loop(%outer_i0: index, %outer_i1: index, %A: memref<?x?xf32>, %B: memref<?x?xf32>,
+ %C: memref<?x?xf32>, %result: memref<?x?xf32>) {
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+ %d0 = dim %A, 0 : memref<?x?xf32>
+ %d1 = dim %A, 1 : memref<?x?xf32>
+ %b0 = affine.min #map0()[%d0, %outer_i0]
+ %b1 = affine.min #map1()[%d1, %outer_i1]
+ loop.parallel (%i0, %i1) = (%c0, %c0) to (%b0, %b1) step (%c1, %c1) {
+ %B_elem = load %B[%i0, %i1] : memref<?x?xf32>
+ %C_elem = load %C[%i0, %i1] : memref<?x?xf32>
+ %sum_elem = addf %B_elem, %C_elem : f32
+ store %sum_elem, %result[%i0, %i1] : memref<?x?xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: func @parallel_loop(
+// CHECK-SAME: [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: memref<?x?xf32>, [[VAL_3:%.*]]: memref<?x?xf32>, [[VAL_4:%.*]]: memref<?x?xf32>, [[VAL_5:%.*]]: memref<?x?xf32>) {
+// CHECK: [[VAL_6:%.*]] = constant 0 : index
+// CHECK: [[VAL_7:%.*]] = constant 1 : index
+// CHECK: [[VAL_8:%.*]] = dim [[VAL_2]], 0 : memref<?x?xf32>
+// CHECK: [[VAL_9:%.*]] = dim [[VAL_2]], 1 : memref<?x?xf32>
+// CHECK: [[VAL_10:%.*]] = affine.min #map0(){{\[}}[[VAL_8]], [[VAL_0]]]
+// CHECK: [[VAL_11:%.*]] = affine.min #map1(){{\[}}[[VAL_9]], [[VAL_1]]]
+// CHECK: [[VAL_12:%.*]] = constant 1024 : index
+// CHECK: [[VAL_13:%.*]] = cmpi "eq", [[VAL_10]], [[VAL_12]] : index
+// CHECK: [[VAL_14:%.*]] = constant 64 : index
+// CHECK: [[VAL_15:%.*]] = cmpi "eq", [[VAL_11]], [[VAL_14]] : index
+// CHECK: [[VAL_16:%.*]] = and [[VAL_13]], [[VAL_15]] : i1
+// CHECK: loop.if [[VAL_16]] {
+// CHECK: loop.parallel ([[VAL_17:%.*]], [[VAL_18:%.*]]) = ([[VAL_6]], [[VAL_6]]) to ([[VAL_12]], [[VAL_14]]) step ([[VAL_7]], [[VAL_7]]) {
+// CHECK: store
+// CHECK: }
+// CHECK: } else {
+// CHECK: loop.parallel ([[VAL_22:%.*]], [[VAL_23:%.*]]) = ([[VAL_6]], [[VAL_6]]) to ([[VAL_10]], [[VAL_11]]) step ([[VAL_7]], [[VAL_7]]) {
+// CHECK: store
+// CHECK: }
+// CHECK: }
+// CHECK: return
+// CHECK: }
More information about the Mlir-commits
mailing list