[Mlir-commits] [mlir] 7c97e32 - [mlir][linalg] Fix generic reduction vectorization
llvmlistbot at llvm.org
Tue Oct 12 15:47:10 PDT 2021
Author: thomasraoux
Date: 2021-10-12T15:46:04-07:00
New Revision: 7c97e328b3b4f5bdf25359057e0aa898aa0a05ca
URL: https://github.com/llvm/llvm-project/commit/7c97e328b3b4f5bdf25359057e0aa898aa0a05ca
DIFF: https://github.com/llvm/llvm-project/commit/7c97e328b3b4f5bdf25359057e0aa898aa0a05ca.diff
LOG: [mlir][linalg] Fix generic reduction vectorization
We shouldn't broadcast the original value when vectorizing a reduction.
Instead, we compute the reduction and then combine it with the original value.
Differential Revision: https://reviews.llvm.org/D111666
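
Why this matters: broadcasting the original (initial) value into the
vectorized iteration space folds it into every lane of the reduced
dimensions, so a non-idempotent combiner such as addf applies it once per
reduced element instead of exactly once. A minimal before/after sketch,
distilled from the @reduce_1d test update below (names abbreviated, not
verbatim IR):

  Before: combine element-wise with the broadcast initial value, then reduce.

    // %r is the vectorized input, %f0_v32 the broadcast initial value.
    %f0_v32 = constant dense<0.000000e+00> : vector<32xf32>
    %a = addf %r, %f0_v32 : vector<32xf32>
    %red = vector.multi_reduction #vector.kind<add>, %a [0] : vector<32xf32> to f32

  After: reduce first, then combine once with the original scalar value.

    %f0 = constant 0.000000e+00 : f32
    %red = vector.multi_reduction #vector.kind<add>, %r [0] : vector<32xf32> to f32
    %a = addf %red, %f0 : f32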
Added:
Modified:
mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
mlir/test/Dialect/Linalg/vectorization.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index a1ea54213cd1..652bf7fba22e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -134,14 +134,13 @@ getKindForOp(Operation *reductionOp) {
}
/// Check whether `outputOperand` is a reduction with a single combiner
-/// operation. Return the combiner operation kind of the reduction, if
-/// supported. Return llvm::None, otherwise. Multiple reduction operations would
-/// impose an ordering between reduction dimensions and is currently unsupported
-/// in Linalg. This limitation is motivated by the fact that e.g. min(max(X)) !=
+/// operation. Return the combiner operation of the reduction. Return
+/// nullptr otherwise. Multiple reduction operations would impose an
+/// ordering between reduction dimensions, which is currently unsupported in
+/// Linalg. This limitation is motivated by the fact that e.g. min(max(X)) !=
/// max(min(X))
// TODO: use in LinalgOp verification, there is a circular dependency atm.
-static llvm::Optional<vector::CombiningKind>
-matchLinalgReduction(OpOperand *outputOperand) {
+static Operation *matchLinalgReduction(OpOperand *outputOperand) {
auto linalgOp = cast<LinalgOp>(outputOperand->getOwner());
unsigned outputPos =
outputOperand->getOperandNumber() - linalgOp.getNumInputs();
@@ -149,10 +148,10 @@ matchLinalgReduction(OpOperand *outputOperand) {
SmallVector<Operation *, 4> combinerOps;
if (!matchReduction(linalgOp.getRegionOutputArgs(), outputPos, combinerOps) ||
combinerOps.size() != 1)
- return llvm::None;
+ return nullptr;
- // Return the combiner operation kind, if supported.
- return getKindForOp(combinerOps[0]);
+ // Return the combiner operation.
+ return combinerOps[0];
}
/// Broadcast `value` to a vector of `shape` if possible. Return value
@@ -171,11 +170,60 @@ static Value broadcastIfNeeded(OpBuilder &b, Value value,
return b.createOrFold<vector::BroadcastOp>(loc, targetVectorType, value);
}
+/// Build a vector.transfer_read from `source` at indices set to all `0`.
+/// If source has rank zero, build a `vector<1xt> transfer_read + extract`.
+/// Return the produced value.
+static Value buildVectorRead(OpBuilder &b, Value source, Type readType,
+ AffineMap map) {
+ Location loc = source.getLoc();
+ auto shapedType = source.getType().cast<ShapedType>();
+ SmallVector<Value> indices(shapedType.getRank(),
+ b.create<ConstantIndexOp>(loc, 0));
+ if (auto vectorType = readType.dyn_cast<VectorType>())
+ return b.create<vector::TransferReadOp>(loc, vectorType, source, indices,
+ map);
+ return vector::TransferReadOp::createScalarOp(b, loc, source, indices);
+}
+
+/// Create MultiDimReductionOp to compute the reduction for `reductionOp`. This
+/// assumes that `reductionOp` has two operands and one of them is the reduction
+/// initial value.
+static Value buildMultiDimReduce(OpBuilder &b, Operation *reduceOp,
+ Value outputArg,
+ const SmallVector<bool> &reductionMask,
+ const BlockAndValueMapping &bvm) {
+ auto maybeKind = getKindForOp(reduceOp);
+ assert(maybeKind && "Failed precondition: could not get reduction kind");
+ Value operandToReduce = reduceOp->getOperand(0) == outputArg
+ ? reduceOp->getOperand(1)
+ : reduceOp->getOperand(0);
+ Value vec = bvm.lookup(operandToReduce);
+ return b.create<vector::MultiDimReductionOp>(reduceOp->getLoc(), vec,
+ reductionMask, *maybeKind);
+}
+
+/// Read the initial value associated with the given `outputOperand`.
+static Value readInitialValue(OpBuilder &b, LinalgOp linalgOp,
+ OpOperand *outputOperand) {
+ AffineMap map = inversePermutation(
+ reindexIndexingMap(linalgOp.getTiedIndexingMap(outputOperand)));
+ Type readType;
+ if (linalgOp.getShape(outputOperand).empty()) {
+ readType = getElementTypeOrSelf(outputOperand->get());
+ } else {
+ readType = VectorType::get(map.compose(linalgOp.getShape(outputOperand)),
+ getElementTypeOrSelf(outputOperand->get()));
+ }
+ Value vectorRead = buildVectorRead(b, outputOperand->get(), readType, map);
+ return vectorRead;
+}
+
/// Assuming `outputOperand` is an output operand of a LinalgOp, determine
/// whether a reduction is needed to produce a `targetType` and create that
/// reduction if it is the case.
static Value reduceIfNeeded(OpBuilder &b, Type targetType, Value value,
- OpOperand *outputOperand) {
+ OpOperand *outputOperand,
+ const BlockAndValueMapping &bvm) {
LDBG("Reduce " << value << " to type " << targetType);
LDBG("In LinalgOp operand #" << outputOperand->getOperandNumber() << "\n"
<< *(outputOperand->getOwner()));
@@ -194,10 +242,9 @@ static Value reduceIfNeeded(OpBuilder &b, Type targetType, Value value,
for (auto s : linalgOp.iterator_types())
if (isParallelIterator(s))
exprs.push_back(getAffineDimExpr(pos++, ctx));
- auto loc = value.getLoc();
- auto maybeKind = matchLinalgReduction(outputOperand);
- assert(maybeKind && "Failed precondition: could not get reduction kind");
+ Operation *reduceOp = matchLinalgReduction(outputOperand);
+ assert(reduceOp && "Failed precondition: could not match a reduction");
unsigned idx = 0;
SmallVector<bool> reductionMask(linalgOp.iterator_types().size(), false);
for (auto attr : linalgOp.iterator_types()) {
@@ -205,23 +252,24 @@ static Value reduceIfNeeded(OpBuilder &b, Type targetType, Value value,
reductionMask[idx] = true;
++idx;
}
- return b.create<vector::MultiDimReductionOp>(loc, value, reductionMask,
- *maybeKind);
-}
-
-/// Build a vector.transfer_read from `source` at indices set to all `0`.
-/// If source has rank zero, build a `vector<1xt> transfer_read + extract`.
-/// Return the produced value.
-static Value buildVectorRead(OpBuilder &b, Value source, Type readType,
- AffineMap map) {
- Location loc = source.getLoc();
- auto shapedType = source.getType().cast<ShapedType>();
- SmallVector<Value> indices(shapedType.getRank(),
- b.create<ConstantIndexOp>(loc, 0));
- if (auto vectorType = readType.dyn_cast<VectorType>())
- return b.create<vector::TransferReadOp>(loc, vectorType, source, indices,
- map);
- return vector::TransferReadOp::createScalarOp(b, loc, source, indices);
+ assert(reduceOp->getNumOperands() == 2 &&
+ "Only support binary reduce op right now");
+ unsigned outputPos =
+ outputOperand->getOperandNumber() - linalgOp.getNumInputs();
+ Value outputArg = linalgOp.getRegionOutputArgs()[outputPos];
+ // Reduce across the iteration space.
+ Value reduce =
+ buildMultiDimReduce(b, reduceOp, outputArg, reductionMask, bvm);
+
+ // Read the original output value.
+ Value initialValue = readInitialValue(b, linalgOp, outputOperand);
+
+ // Combine the output argument with the reduced value.
+ OperationState state(reduceOp->getLoc(), reduceOp->getName());
+ state.addAttributes(reduceOp->getAttrs());
+ state.addOperands({reduce, initialValue});
+ state.addTypes(initialValue.getType());
+ return b.createOperation(state)->getResult(0);
}
/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
@@ -229,7 +277,8 @@ static Value buildVectorRead(OpBuilder &b, Value source, Type readType,
/// currently being vectorized. If `dest` has null rank, build a memref.store.
/// Return the produced value or null if no value is produced.
static Value buildVectorWrite(OpBuilder &b, Value value,
- OpOperand *outputOperand) {
+ OpOperand *outputOperand,
+ const BlockAndValueMapping &bvm) {
Operation *write;
Location loc = value.getLoc();
auto linalgOp = cast<LinalgOp>(outputOperand->getOwner());
@@ -244,12 +293,12 @@ static Value buildVectorWrite(OpBuilder &b, Value value,
SmallVector<Value> indices(linalgOp.getRank(outputOperand),
b.create<ConstantIndexOp>(loc, 0));
value = broadcastIfNeeded(b, value, vectorType.getShape());
- value = reduceIfNeeded(b, vectorType, value, outputOperand);
+ value = reduceIfNeeded(b, vectorType, value, outputOperand, bvm);
write = b.create<vector::TransferWriteOp>(loc, value, outputOperand->get(),
indices, map);
} else {
- value =
- reduceIfNeeded(b, getElementTypeOrSelf(value), value, outputOperand);
+ value = reduceIfNeeded(b, getElementTypeOrSelf(value), value, outputOperand,
+ bvm);
write = vector::TransferWriteOp::createScalarOp(
b, loc, value, outputOperand->get(), ValueRange{});
}
@@ -284,7 +333,7 @@ vectorizeLinalgYield(OpBuilder &b, Operation *op,
// TODO: use a map.
Value vectorValue = bvm.lookup(outputs.value());
Value newResult = buildVectorWrite(
- b, vectorValue, linalgOp.getOutputOperand(outputs.index()));
+ b, vectorValue, linalgOp.getOutputOperand(outputs.index()), bvm);
if (newResult)
newResults.push_back(newResult);
}
@@ -611,7 +660,8 @@ static LogicalResult reductionPreconditions(LinalgOp op) {
return failure();
}
for (OpOperand *opOperand : op.getOutputOperands()) {
- if (!matchLinalgReduction(opOperand)) {
+ Operation *reduceOp = matchLinalgReduction(opOperand);
+ if (!reduceOp || !getKindForOp(reduceOp)) {
LDBG("reduction precondition failed: reduction detection failed");
return failure();
}
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index d3aa8c3a2953..ea99d00f0c8d 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -744,17 +744,15 @@ func @pad_tensor_non_const_pad_value(%arg0: tensor<5x6xf32>) -> tensor<12x13xf32
// -----
-// CHECK-DAG: #[[$M0:.*]] = affine_map<(d0, d1) -> (d0, d1, 0)>
-
// CHECK-LABEL: func @sum_exp
func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>)
-> tensor<4x16xf32>
{
// CHECK: vector.transfer_read {{.*}} : tensor<4x16x8xf32>, vector<4x16x8xf32>
- // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true], permutation_map = #[[$M0]]} : tensor<4x16xf32>, vector<4x16x8xf32>
// CHECK: math.exp {{.*}} : vector<4x16x8xf32>
- // CHECK: addf {{.*}} : vector<4x16x8xf32>
// CHECK: vector.multi_reduction #vector.kind<add>, %{{.*}} [2] : vector<4x16x8xf32> to vector<4x16xf32>
+ // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<4x16xf32>, vector<4x16xf32>
+ // CHECK: addf {{.*}} : vector<4x16xf32>
// CHECK: vector.transfer_write {{.*}} : vector<4x16xf32>, tensor<4x16xf32>
// CHECK: return {{.*}} : tensor<4x16xf32>
%0 = linalg.generic {
@@ -776,8 +774,7 @@ func @sum_exp(%input: tensor<4x16x8xf32>, %output: tensor<4x16xf32>)
// CHECK-DAG: #[[$M1:.*]] = affine_map<(d0, d1) -> (d1, d0, 0, 0)>
// CHECK-DAG: #[[$M2:.*]] = affine_map<(d0, d1) -> (0, 0, d1, d0)>
-// CHECK-DAG: #[[$M3:.*]] = affine_map<(d0, d1) -> (d1, 0, 0, d0)>
-// CHECK-DAG: #[[$M4:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+// CHECK-DAG: #[[$M3:.*]] = affine_map<(d0, d1) -> (d1, d0)>
// CHECK-LABEL: func @sum_exp_2
func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: tensor<5x2xf32>)
@@ -785,13 +782,13 @@ func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: ten
{
// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M1]]} : tensor<3x2xf32>, vector<2x3x4x5xf32>
// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M2]]} : tensor<5x4xf32>, vector<2x3x4x5xf32>
- // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true, true, true], permutation_map = #[[$M3]]} : tensor<5x2xf32>, vector<2x3x4x5xf32>
// CHECK: math.exp {{.*}} : vector<2x3x4x5xf32>
// CHECK: math.exp {{.*}} : vector<2x3x4x5xf32>
// CHECK: addf {{.*}} : vector<2x3x4x5xf32>
- // CHECK: addf {{.*}} : vector<2x3x4x5xf32>
// CHECK: vector.multi_reduction #vector.kind<add>, {{.*}} [1, 2] : vector<2x3x4x5xf32> to vector<2x5xf32>
- // CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true], permutation_map = #[[$M4]]} : vector<2x5xf32>, tensor<5x2xf32>
+ // CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true], permutation_map = #[[$M3]]} : tensor<5x2xf32>, vector<2x5xf32>
+ // CHECK: addf {{.*}} : vector<2x5xf32>
+ // CHECK: vector.transfer_write {{.*}} {in_bounds = [true, true], permutation_map = #[[$M3]]} : vector<2x5xf32>, tensor<5x2xf32>
// CHECK: return {{.*}} : tensor<5x2xf32>
%0 = linalg.generic {
indexing_maps = [
@@ -815,12 +812,11 @@ func @sum_exp_2(%input: tensor<3x2xf32>, %input_2: tensor<5x4xf32>, %output: ten
// CHECK-LABEL: func @red_max_2d(
func @red_max_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+ // CHECK: %[[CMINF:.+]] = constant dense<-3.402820e+38> : vector<4xf32>
// CHECK: linalg.init_tensor [4] : tensor<4xf32>
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
- // CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
- // CHECK: vector.transfer_read {{.*}} : tensor<4xf32>, vector<4x4xf32>
- // CHECK: maxf {{.*}} : vector<4x4xf32>
- // CHECK: vector.multi_reduction #vector.kind<maxf>, {{.*}} [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: %[[R:.+]] = vector.multi_reduction #vector.kind<maxf>, {{.*}} [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: maxf %[[R]], %[[CMINF]] : vector<4xf32>
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
%ident = constant -3.40282e+38 : f32
%init = linalg.init_tensor [4] : tensor<4xf32>
@@ -840,12 +836,12 @@ func @red_max_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
// CHECK-LABEL: func @red_min_2d(
func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+ // CHECK: %[[CMAXF:.+]] = constant dense<3.402820e+38> : vector<4xf32>
// CHECK: linalg.init_tensor [4] : tensor<4xf32>
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
// CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
- // CHECK: vector.transfer_read {{.*}} : tensor<4xf32>, vector<4x4xf32>
- // CHECK: minf {{.*}} : vector<4x4xf32>
- // CHECK: vector.multi_reduction #vector.kind<minf>, {{.*}} [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: %[[R:.+]] = vector.multi_reduction #vector.kind<minf>, {{.*}} [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: minf %[[R]], %[[CMAXF]] : vector<4xf32>
// CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
%maxf32 = constant 3.40282e+38 : f32
%init = linalg.init_tensor [4] : tensor<4xf32>
@@ -855,7 +851,7 @@ func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
iterator_types = ["parallel", "reduction"]}
ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
^bb0(%in0: f32, %out0: f32): // no predecessors
- %min = minf %in0, %out0 : f32
+ %min = minf %out0, %in0 : f32
linalg.yield %min : f32
} -> tensor<4xf32>
return %red : tensor<4xf32>
@@ -1026,7 +1022,7 @@ func @fused_broadcast_red_2d(%arg0: tensor<4x4xf32>, %arg1: tensor<4x1xf32>) ->
// CHECK-SAME: %[[A:.*]]: tensor<32xf32>
func @reduce_1d(%arg0: tensor<32xf32>) -> tensor<f32> {
// CHECK-DAG: %[[F0_v1:.*]] = constant dense<0.000000e+00> : vector<1xf32>
- // CHECK-DAG: %[[F0_v32:.*]] = constant dense<0.000000e+00> : vector<32xf32>
+ // CHECK-DAG: %[[F0:.*]] = constant 0.000000e+00 : f32
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
%f0 = constant 0.000000e+00 : f32
@@ -1036,13 +1032,12 @@ func @reduce_1d(%arg0: tensor<32xf32>) -> tensor<f32> {
// CHECK: %[[f:.*]] = vector.transfer_write %[[F0_v1]], %[[init]][]
// CHECK-SAME: : vector<1xf32>, tensor<f32>
%1 = linalg.fill(%f0, %0) : f32, tensor<f32> -> tensor<f32>
-
// CHECK: %[[r:.*]] = vector.transfer_read %[[A]][%[[C0]]]
// CHECK-SAME: : tensor<32xf32>, vector<32xf32>
- // CHECK: %[[a:.*]] = addf %[[r]], %[[F0_v32]] : vector<32xf32>
- // CHECK: %[[red:.*]] = vector.multi_reduction #vector.kind<add>, %[[a]] [0]
+ // CHECK: %[[red:.*]] = vector.multi_reduction #vector.kind<add>, %[[r]] [0]
// CHECK-SAME: : vector<32xf32> to f32
- // CHECK: %[[red_v1:.*]] = vector.broadcast %[[red]] : f32 to vector<1xf32>
+ // CHECK: %[[a:.*]] = addf %[[red]], %[[F0]] : f32
+ // CHECK: %[[red_v1:.*]] = vector.broadcast %[[a]] : f32 to vector<1xf32>
// CHECK: %[[res:.*]] = vector.transfer_write %[[red_v1]], %[[f]][]
// CHECK-SAME: : vector<1xf32>, tensor<f32>
%2 = linalg.generic {