[Mlir-commits] [mlir] [mlir][linalg] Emit a warning when tiling using forall generates non thread-safe code (PR #80813)
Pablo Antonio Martinez
llvmlistbot at llvm.org
Tue Feb 27 05:55:45 PST 2024
https://github.com/pabloantoniom updated https://github.com/llvm/llvm-project/pull/80813
>From 5d459622a7a91cf212b0ff1c512b8043364d94bc Mon Sep 17 00:00:00 2001
From: Pablo Antonio Martinez <pablo.antonio.martinez at huawei.com>
Date: Mon, 5 Feb 2024 16:57:59 +0000
Subject: [PATCH] [mlir][Linalg][Transform] Emit a warning when
tile_using_forall generates non thread-safe code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This warning aims to complement the comment in the documentation that
says:
"It is the user’s responsibility to ensure that num_threads/tile_sizes
is a valid tiling specification (i.e. that only tiles parallel
dimensions, e.g. in the Linalg case)."
because:
1. Not all users of tile_using_forall know that tiling the wrong
dimension/s (e.g., a non-parallel dimension) will generate non
thread-safe code, so this warning will inform the user about it.
2. Users of tile_using_forall may know this limitation, but they may
not realize that they are tiling a non-parallel dimension, so the
warning may help in the debugging process.
---
.../Linalg/TransformOps/LinalgTransformOps.td | 4 +-
mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp | 57 +++++++++-
mlir/test/Dialect/Linalg/tile-to-forall.mlir | 100 ++++++++++++++++++
3 files changed, 158 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 309573a562872f..e947720471f78c 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1922,7 +1922,9 @@ def TileUsingForallOp :
It is the user's responsibility to ensure that `num_threads/tile_sizes` is
a valid tiling specification (i.e. that only tiles parallel dimensions,
- e.g. in the Linalg case).
+ e.g. in the Linalg case). If the dimension is not parallelizable, a warning
+ is issued to notify the user that the generated code is not safe to
+ parallelize.
If non-empty, the `mapping` is added as an attribute to the
resulting `scf.forall`.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 30aed850bed81e..ed97ad70e6e390 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -304,6 +304,50 @@ static void calculateTileOffsetsAndSizes(
}
}
+/// Returns a vector of bools representing if, for the given axis, `op` can be
+/// tiled by `numThreads` without incurring in a race condition and thus it is
+/// thread-safe to do the tiling. This is checked by iterating over the affine
+/// maps of the outputs in `op` and ensuring that all the results in the map are
+/// present in the affine map represented by the tiling sizes, which is derived
+/// from `numThreads` or `nominalTileSizes`.
+SmallVector<bool>
+safeToTileToForall(mlir::MLIRContext *ctx, TilingInterface op,
+ ArrayRef<OpFoldResult> numThreads,
+ std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
+ int numDims) {
+ ArrayRef<OpFoldResult> tilingValues =
+ nominalTileSizes.has_value() ? *nominalTileSizes : numThreads;
+
+ SmallVector<bool> safeToTile(tilingValues.size(), true);
+ LinalgOp linalgOp = dyn_cast<LinalgOp>(op.getOperation());
+ if (!linalgOp)
+ return safeToTile;
+
+ SmallVector<AffineExpr> dimExprs;
+ dimExprs.reserve(numDims);
+ for (unsigned i = 0; i < tilingValues.size(); i++) {
+ if (auto attr = llvm::dyn_cast_if_present<Attribute>(tilingValues[i])) {
+ if (cast<IntegerAttr>(attr).getValue().getSExtValue() > 1)
+ dimExprs.push_back(mlir::getAffineDimExpr(i, ctx));
+ } else {
+ dimExprs.push_back(mlir::getAffineDimExpr(i, ctx));
+ }
+ }
+
+ for (uint32_t resNum = 0; resNum < op->getNumResults(); resNum++) {
+ AffineMap map =
+ linalgOp.getIndexingMapMatchingResult(op->getResult(resNum));
+
+ for (AffineExpr r : dimExprs) {
+ unsigned int axis = cast<AffineDimExpr>(r).getPosition();
+ if (!llvm::is_contained(map.getResults(), r))
+ safeToTile[axis] = false;
+ }
+ }
+
+ return safeToTile;
+}
+
/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The
/// tiling is specified by the number of tiles/threads `numThreads` and the
/// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is
@@ -314,8 +358,10 @@ static void calculateTileOffsetsAndSizes(
/// size of data.
/// It is the user's responsibility to ensure that `numThreads` is a valid
/// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the
-/// Linalg case). If `omitTileOffsetBoundsCheck` is true, then the function will
-/// assume that `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
+/// Linalg case). If the dimension is not parallelizable, a warning is issued to
+/// notify the user that the generated code is not safe to parallelize. If
+/// `omitTileOffsetBoundsCheck` is true, then the function will assume that
+/// `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
static FailureOr<ForallTilingResult> tileToForallOpImpl(
RewriterBase &b, TilingInterface op, ArrayRef<OpFoldResult> numThreads,
std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
@@ -344,6 +390,13 @@ static FailureOr<ForallTilingResult> tileToForallOpImpl(
return getValueOrCreateConstantIndexOp(b, loc, ofr);
}));
+ // Check if tiling is thread safe and print a warning if not.
+ SmallVector<bool> tilingSafety = safeToTileToForall(
+ b.getContext(), op, numThreads, nominalTileSizes, loopRanges.size());
+ for (size_t i = 0; i < tilingSafety.size(); i++)
+ if (!tilingSafety[i])
+ op.emitWarning() << "tiling is not thread safe at axis #" << i;
+
// 1. Create the ForallOp. We don't use the lambda body-builder
// version because we require the use of RewriterBase in the body, so we
// manually move the insertion point to the body below.
diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
index abd807b3e4d3e1..e52f76c619575a 100644
--- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
@@ -586,3 +586,103 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+
+func.func @tile_thread_safety1(%arg0: tensor<100x300xf32>, %arg1: tensor<100xf32>) -> tensor<100xf32> {
+ // expected-warning at +1 {{tiling is not thread safe at axis #1}}
+ %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<100x300xf32>) outs(%arg1 : tensor<100xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %1 = arith.addf %in, %out : f32
+ linalg.yield %1 : f32
+ } -> tensor<100xf32>
+ return %0 : tensor<100xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [4, 2]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+
+func.func @tile_thread_safety2(%arg0: tensor<100x300x8xf32>, %arg1: tensor<300x8xf32>) -> tensor<300x8xf32> {
+ // expected-warning at +1 {{tiling is not thread safe at axis #0}}
+ %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction", "parallel", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<300x8xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %1 = arith.addf %in, %out : f32
+ linalg.yield %1 : f32
+ } -> tensor<300x8xf32>
+ return %0 : tensor<300x8xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+
+func.func @tile_thread_safety3(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>) -> tensor<100x8xf32> {
+ // expected-warning at +1 {{tiling is not thread safe at axis #1}}
+ %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<100x8xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %1 = arith.addf %in, %out : f32
+ linalg.yield %1 : f32
+ } -> tensor<100x8xf32>
+ return %0 : tensor<100x8xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d2)>
+
+func.func @tile_thread_safety4(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>, %arg2 : tensor<8xf32>) -> (tensor<100x8xf32>, tensor<8xf32>) {
+ // expected-warning at +2 {{tiling is not thread safe at axis #0}}
+ // expected-warning at +1 {{tiling is not thread safe at axis #1}}
+ %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1, %arg2 : tensor<100x8xf32>, tensor<8xf32>) {
+ ^bb0(%in: f32, %out1: f32, %out2: f32):
+ %1 = arith.addf %in, %out1 : f32
+ %2 = arith.addf %in, %out2 : f32
+ linalg.yield %1, %2 : f32, f32
+ } -> (tensor<100x8xf32>, tensor<8xf32>)
+ return %0#0, %0#1 : tensor<100x8xf32>, tensor<8xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
More information about the Mlir-commits
mailing list