[Mlir-commits] [mlir] f0a411d - [mlir][Transform] Significantly clean up scf.foreach_thread and GPU transform permutation handling
Nicolas Vasilache
llvmlistbot at llvm.org
Mon Nov 14 09:19:54 PST 2022
Author: Nicolas Vasilache
Date: 2022-11-14T09:19:49-08:00
New Revision: f0a411da771813bc034d7240f3f7d5e0ae1cb67a
URL: https://github.com/llvm/llvm-project/commit/f0a411da771813bc034d7240f3f7d5e0ae1cb67a
DIFF: https://github.com/llvm/llvm-project/commit/f0a411da771813bc034d7240f3f7d5e0ae1cb67a.diff
LOG: [mlir][Transform] Significantly clean up scf.foreach_thread and GPU transform permutation handling
Previously, the need for a dense permutation leaked into the thread_dim_mapping specification.
This revision allows using a sparse specification of the thread_dim_mapping; the proper completion / sorting is applied automatically.
In the process, the semantics of scf.foreach_thread are tightened to require a matching number of thread dimensions and mappings.
The relevant negative test is added.
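For illustration, a minimal sketch of the new convention (mirroring the updated tests below; the function name is hypothetical): a rank-2 scf.foreach_thread now carries exactly two mapping entries, and the GPU transform completes the missing z dimension with a constant 1 before sorting the sizes by their mapping attribute.

  func.func @sparse_mapping_example(%x: memref<2x32xf32>, %y: memref<2x32xf32>) {
    %c2 = arith.constant 2 : index
    %c32 = arith.constant 32 : index
    // Rank-2 loop: the mapping lists exactly two entries, one per thread dimension.
    scf.foreach_thread (%i, %j) in (%c2, %c32) {
      %0 = memref.load %x[%i, %j] : memref<2x32xf32>
      memref.store %0, %y[%i, %j] : memref<2x32xf32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    return
  }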
Differential Revision: https://reviews.llvm.org/D137906
Added:
Modified:
mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
mlir/lib/Dialect/SCF/IR/SCF.cpp
mlir/test/Dialect/GPU/transform-gpu-failing.mlir
mlir/test/Dialect/GPU/transform-gpu.mlir
mlir/test/Dialect/SCF/invalid.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 3fa890b933955..67a8e438d583b 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -536,15 +536,14 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
return getBody()->getArguments().drop_front(getRank());
}
- /// Return the thread indices in the order specified by the
- /// given mapping argument. Return failure is
- /// mapping is not a valid permutation.
- FailureOr<SmallVector<Value>> getPermutedThreadIndices(ArrayRef<int64_t> mapping);
-
- /// Return the number of threads in the order specified by the
- /// given mapping argument.
- /// Return failure is mapping is not a valid permutation.
- FailureOr<SmallVector<OpFoldResult>> getPermutedNumThreads(OpBuilder &b, ArrayRef<int64_t> mapping);
+ /// Helper to sort `values` according to matching `keys`.
+ /// Take a custom `compare` binary comparator which returns true if the first
+ /// element is smaller than the second (i.e. compatible with std::sort).
+ /// This is a helper typically used to sort numThreads values before they are
+ /// mapped to concrete physical dimensions of hardware.
+ static SmallVector<Value> getValuesSortedByKey(
+ ArrayRef<Attribute> keys, ValueRange values,
+ llvm::function_ref<bool(Attribute, Attribute)> compare);
// The ensureTerminator method generated by SingleBlockImplicitTerminator is
// unaware of the fact that our terminator also needs a region to be
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 460eb2f95fbc3..5fb90ac45ce13 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/None.h"
@@ -157,45 +158,75 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
SmallVectorImpl<Value> &)>
blockIdGenerator,
SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp) {
+ // Step 0. Target-specific verifications. There is no good place to anchor
+ // those right now: the ForeachThreadOp is target-independent and the
+ // transform op does not apply to individual ForeachThreadOp.
+ MLIRContext *ctx = foreachThreadOp->getContext();
+ Location loc = foreachThreadOp->getLoc();
+ Attribute bX = GPUBlockMappingAttr::get(ctx, Blocks::DimX);
+ Attribute bY = GPUBlockMappingAttr::get(ctx, Blocks::DimY);
+ Attribute bZ = GPUBlockMappingAttr::get(ctx, Blocks::DimZ);
if (foreachThreadOp.getNumResults() > 0)
return transformOp.emitSilenceableError()
- << "only bufferized scf.foreach_thread lowers to gpu.block_id";
+ << "only bufferized scf.foreach_thread lowers to "
+ "gpu.block_id";
if (foreachThreadOp.getNumThreads().size() > 3)
return transformOp.emitSilenceableError()
- << "scf.foreach_thread with rank > 3 does not lower to gpu.block_id";
-
- // Step 0. Outline the compute workload region and set up the workload
- // operands.
- SmallVector<int64_t> mapping;
+ << "scf.foreach_thread with rank > 3 does not lower to "
+ "gpu.block_id";
+ if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) {
+ return !v.getDefiningOp<arith::ConstantIndexOp>();
+ })) {
+ return transformOp.emitSilenceableError()
+ << "unsupported dynamic griddim size";
+ }
if (!foreachThreadOp.getMapping().has_value())
return transformOp.emitSilenceableError() << "mapping must be present";
- for (DeviceMappingAttrInterface map :
- foreachThreadOp.getMapping()->getValue()) {
- if (auto blockMap = map.dyn_cast<GPUBlockMappingAttr>()) {
- mapping.push_back((int64_t)blockMap.getBlock());
- } else {
- return transformOp.emitSilenceableError()
- << "mapping must be #gpu.block<x/y/z/>";
- }
+ SmallVector<Attribute> blockMapping =
+ llvm::to_vector(foreachThreadOp.getMapping()->getValue());
+ if (llvm::any_of(blockMapping, [](DeviceMappingAttrInterface map) {
+ return !map.isa<GPUBlockMappingAttr>();
+ })) {
+ return transformOp.emitSilenceableError()
+ << "mapping must be #gpu.block<x/y/z/>";
}
- FailureOr<SmallVector<OpFoldResult>> potentialGridDim =
- foreachThreadOp.getPermutedNumThreads(rewriter, mapping);
-
- if (failed(potentialGridDim) ||
- llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
- return !getConstantIntValue(ofr).has_value();
- })) {
- return transformOp.emitSilenceableError() << "unsupported dynamic gridDim";
+ // Step 1. Complete the blockMapping to a full mapping (with 1s) if necessary.
+ SmallVector<Value> numBlocks =
+ llvm::to_vector(foreachThreadOp.getNumThreads());
+ // Ensure we have 3 block sizes, one for each id.
+ Value one;
+ for (auto attr : {bX, bY, bZ}) {
+ if (std::find(blockMapping.begin(), blockMapping.end(), attr) ==
+ blockMapping.end()) {
+ blockMapping.push_back(attr);
+ one = one ? one : rewriter.create<arith::ConstantIndexOp>(loc, 1);
+ numBlocks.push_back(one);
+ }
}
- for (OpFoldResult ofr : *potentialGridDim)
- gridDims.push_back(getConstantIntValue(ofr).value());
+ // Step 2. sort the values by the corresponding GPUBlockMappingAttr.
+ auto comparator = [](Attribute a, Attribute b) -> bool {
+ return static_cast<int64_t>(a.cast<GPUBlockMappingAttr>().getBlock()) <
+ static_cast<int64_t>(b.cast<GPUBlockMappingAttr>().getBlock());
+ };
+ SmallVector<Value> gridDimValues = scf::ForeachThreadOp::getValuesSortedByKey(
+ blockMapping, numBlocks, comparator);
+ for (Value v : gridDimValues)
+ gridDims.push_back(v.getDefiningOp<arith::ConstantIndexOp>().value());
+ // Step 3. Generate the blockIds using the provided generator and map the
+ // induction variables to the newly created ops.
SmallVector<Value> blockOps;
blockIdGenerator(rewriter, foreachThreadOp, blockOps);
+ BlockAndValueMapping bvm;
+ for (auto [blockIdx, blockDim] :
+ llvm::zip(foreachThreadOp.getThreadIndices(), blockMapping)) {
+ bvm.map(blockIdx, blockOps[static_cast<int64_t>(
+ blockDim.cast<GPUBlockMappingAttr>().getBlock())]);
+ }
- // Step 1. Move the body of foreachThreadOp.
+ // Step 4. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used since we are on buffers.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock = foreachThreadOp->getBlock();
@@ -204,20 +235,16 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
- // Step 2. RAUW thread indices to thread ops.
- SmallVector<Value> threadIndices =
- *foreachThreadOp.getPermutedThreadIndices(mapping);
- assert(blockOps.size() == 3 && "3 block id ops are required");
- for (auto [blockIdx, blockOp] : llvm::zip(threadIndices, blockOps)) {
- Value val = blockIdx;
- Value blkOp = blockOp;
- if (!val)
- continue;
- for (Operation *user : llvm::make_early_inc_range(val.getUsers()))
- user->replaceUsesOfWith(val, blkOp);
+ // Step 5. RAUW thread indices to thread ops.
+ for (Value blockIdx : foreachThreadOp.getThreadIndices()) {
+ for (Operation *user : llvm::make_early_inc_range(blockIdx.getUsers())) {
+ rewriter.updateRootInPlace(user, [&]() {
+ user->replaceUsesOfWith(blockIdx, bvm.lookup(blockIdx));
+ });
+ }
}
- // Step 3. Erase old op.
+ // Step 6. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
@@ -252,11 +279,10 @@ static void generateGpuBlockIds(RewriterBase &rewriter,
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(foreachOp);
IndexType indexType = rewriter.getIndexType();
- SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
- for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
- blockOps.push_back(
- rewriter.create<BlockIdOp>(loc, indexType, gpuDims[idx]));
- }
+ blockOps = SmallVector<Value>{
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
}
DiagnosedSilenceableFailure
@@ -333,6 +359,9 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
const SmallVectorImpl<int64_t> &globalBlockDims, bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp) {
+ // Step 0. Target-specific verifications. There is no good place to anchor
+ // those right now: the ForeachThreadOp is target-independent and the
+ // transform op does not apply to individual ForeachThreadOp.
auto failureHelper =
[&](const Twine &message) -> DiagnosedSilenceableFailure {
if (transformOp.has_value()) {
@@ -340,54 +369,79 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
}
return emitDefiniteFailure(foreachThreadOp, message);
};
-
+ MLIRContext *ctx = foreachThreadOp->getContext();
+ Location loc = foreachThreadOp->getLoc();
+ Attribute tX = GPUThreadMappingAttr::get(ctx, Threads::DimX);
+ Attribute tY = GPUThreadMappingAttr::get(ctx, Threads::DimY);
+ Attribute tZ = GPUThreadMappingAttr::get(ctx, Threads::DimZ);
if (foreachThreadOp.getNumResults() > 0)
return failureHelper(
"only bufferized scf.foreach_thread lowers to gpu.thread_id");
-
if (foreachThreadOp.getNumThreads().size() > 3)
return failureHelper(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");
-
- SmallVector<int64_t> mapping;
+ if (llvm::any_of(foreachThreadOp.getNumThreads(), [](Value v) {
+ return !v.getDefiningOp<arith::ConstantIndexOp>();
+ })) {
+ return failureHelper("unsupported dynamic blockdim size");
+ }
if (!foreachThreadOp.getMapping().has_value())
return failureHelper("mapping must be present");
- for (DeviceMappingAttrInterface map :
- foreachThreadOp.getMapping()->getValue()) {
- if (auto threadMap = map.dyn_cast<GPUThreadMappingAttr>()) {
- mapping.push_back((int64_t)threadMap.getThread());
- } else {
- return failureHelper("mapping must be #gpu.thread<x/y/z/>");
- }
- }
- FailureOr<SmallVector<OpFoldResult>> potentialBlockDim =
- foreachThreadOp.getPermutedNumThreads(rewriter, mapping);
- if (failed(potentialBlockDim) ||
- llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
- return !getConstantIntValue(ofr).has_value();
+ SmallVector<Attribute> threadMapping =
+ llvm::to_vector(foreachThreadOp.getMapping()->getValue());
+ if (llvm::any_of(threadMapping, [](DeviceMappingAttrInterface map) {
+ return !map.isa<GPUThreadMappingAttr>();
})) {
- return failureHelper("unsupported dynamic blockdim size");
+ return transformOp->emitSilenceableError()
+ << "mapping must be #gpu.thread<x/y/z/>";
}
- SmallVector<int64_t> blockDim =
- llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) {
- return getConstantIntValue(ofr).value();
+ // Step 1. Complete the threadMapping to a full mapping (with 1s) if
+ // necessary.
+ SmallVector<Value> numThreads =
+ llvm::to_vector(foreachThreadOp.getNumThreads());
+ // Ensure we have 3 block sizes, one for each id.
+ Value one;
+ for (auto attr : {tX, tY, tZ}) {
+ if (std::find(threadMapping.begin(), threadMapping.end(), attr) ==
+ threadMapping.end()) {
+ threadMapping.push_back(attr);
+ one = one ? one : rewriter.create<arith::ConstantIndexOp>(loc, 1);
+ numThreads.push_back(one);
+ }
+ }
+
+ // Step 2. sort the values by the corresponding GPUThreadMappingAttr.
+ auto comparator = [](Attribute a, Attribute b) -> bool {
+ return static_cast<int64_t>(a.cast<GPUThreadMappingAttr>().getThread()) <
+ static_cast<int64_t>(b.cast<GPUThreadMappingAttr>().getThread());
+ };
+ SmallVector<Value> blockDimValues =
+ scf::ForeachThreadOp::getValuesSortedByKey(threadMapping, numThreads,
+ comparator);
+ SmallVector<int64_t> blockDims =
+ llvm::to_vector(llvm::map_range(blockDimValues, [](Value v) {
+ return v.getDefiningOp<arith::ConstantIndexOp>().value();
}));
- // Step 1. Create the gpu.thread ops
- Location loc = foreachThreadOp.getLoc();
+ // Step 3. Create the gpu.thread ops and map the induction variables to the
+ // newly created ops.
IndexType indexType = rewriter.getIndexType();
-
- SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
- SmallVector<Value> threadOps;
- for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
- threadOps.push_back(
- rewriter.create<ThreadIdOp>(loc, indexType, gpuDims[idx]));
+ SmallVector<Value> threadOps{
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+ BlockAndValueMapping bvm;
+ for (auto [blockIdx, blockDim] :
+ llvm::zip(foreachThreadOp.getThreadIndices(), threadMapping)) {
+ bvm.map(blockIdx, threadOps[static_cast<int64_t>(
+ blockDim.cast<GPUThreadMappingAttr>().getThread())]);
}
- // Step 2. Maybe create conditionals to predicate the region.
+
+ // Step 4. Maybe create conditionals to predicate the region.
Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :
- llvm::zip(threadOps, blockDim, globalBlockDims)) {
+ llvm::zip(threadOps, blockDims, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return failureHelper(
"The requested GPU threads are fewer than the number of loop trip "
@@ -404,19 +458,19 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
: tmpPredicate;
}
- // Step 3. Move the body of foreachThreadOp.
+ // Step 5. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock;
Block::iterator insertionPoint;
if (predicate) {
- // Step 3.a. If predicated, move at the beginning.
+ // Step 5.a. If predicated, move at the beginning.
auto ifOp =
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();
} else {
- // Step 3.a. Otherwise, move inline just before foreachThreadOp.
+ // Step 5.b. Otherwise, move inline just before foreachThreadOp.
targetBlock = foreachThreadOp->getBlock();
insertionPoint = Block::iterator(foreachThreadOp);
}
@@ -424,25 +478,21 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
- // Step 4. RAUW thread indices to thread ops.
- SmallVector<Value> threadIndices =
- *foreachThreadOp.getPermutedThreadIndices(mapping);
- for (auto [threadIdx, threadOp] : llvm::zip(threadIndices, threadOps)) {
- Value val = threadIdx;
- Value op = threadOp;
- if (!val)
- continue;
- for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
- user->replaceUsesOfWith(val, op);
+ // Step 6. RAUW thread indices to thread ops.
+ for (Value threadIdx : foreachThreadOp.getThreadIndices()) {
+ for (Operation *user : llvm::make_early_inc_range(threadIdx.getUsers())) {
+ rewriter.updateRootInPlace(user, [&]() {
+ user->replaceUsesOfWith(threadIdx, bvm.lookup(threadIdx));
+ });
}
}
- // Step 5. syncthreads.
+ // Step 7. syncthreads.
// TODO: Need warpsync
if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);
- // Step 6. Erase old op.
+ // Step 8. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 6a85fb64d3d05..b39edc0305002 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1114,12 +1114,15 @@ LogicalResult ForeachThreadOp::verify() {
if (body->getArgument(i + getRank()).getType() != getOutputs()[i].getType())
return emitOpError("type mismatch between ")
<< i << "-th output and corresponding block argument";
- if (getMapping().has_value())
+ if (getMapping().has_value() && !getMapping()->empty()) {
+ if (static_cast<int64_t>(getMapping()->size()) != getRank())
+ return emitOpError() << "mapping attribute size must match op rank";
for (auto map : getMapping()->getValue()) {
if (!isa<DeviceMappingAttrInterface>(map))
return emitOpError()
<< getMappingAttrName() << " is not device mapping attribute";
}
+ }
return success();
}
@@ -1294,59 +1297,21 @@ PerformConcurrentlyOp ForeachThreadOp::getTerminator() {
return cast<PerformConcurrentlyOp>(getBody()->getTerminator());
}
-template <typename T>
-static FailureOr<SmallVector<T>> permute(const SmallVector<T> &vals,
- ArrayRef<int64_t> perm) {
- if (vals.size() != perm.size())
- return failure();
- SmallVector<T> result(vals.size());
- SmallVector<bool> seen(vals.size());
- for (auto [idx, val] : llvm::zip(perm, vals)) {
- // Already seen, invalid mapping.
- if (seen[idx])
- return failure();
- result[idx] = val;
- seen[idx] = true;
- }
- // Some not seen, invalid mapping.
- if (!llvm::all_of(seen, [](bool b) { return b; }))
- return failure();
- return result;
-}
-
-/// Helper to get apply the `mapping` permutation of a
-/// `foreachThreadOp` to `values`.
-template <typename T>
-static FailureOr<SmallVector<T>>
-getValuesPermutedByThreadMapping(scf::ForeachThreadOp foreachThreadOp,
- const SmallVector<T> &values,
- ArrayRef<int64_t> mapping) {
- // Apply mapping permutation if specified.
- FailureOr<SmallVector<T>> maybePermuted = permute(values, mapping);
- if (failed(maybePermuted))
- return foreachThreadOp->emitError("invalid permutation");
- return *maybePermuted;
- return values;
-}
-
-/// Return the thread indices in the order specified by the mapping
-/// attribute. Return failure is mapping is not a valid permutation.
-FailureOr<SmallVector<Value>>
-ForeachThreadOp::getPermutedThreadIndices(ArrayRef<int64_t> mapping) {
- SmallVector<Value> threadCountValues = this->getThreadIndices();
- threadCountValues.resize(3, Value());
- return getValuesPermutedByThreadMapping(*this, threadCountValues, mapping);
-}
-
-/// Return the number of threads in the order specified by the
-/// mapping attribute.
-/// Return failure is mapping is not a valid permutation.
-FailureOr<SmallVector<OpFoldResult>>
-ForeachThreadOp::getPermutedNumThreads(OpBuilder &b,
- ArrayRef<int64_t> mapping) {
- SmallVector<OpFoldResult> threadCountValues = this->getNumThreads();
- threadCountValues.resize(3, b.getIndexAttr(1));
- return getValuesPermutedByThreadMapping(*this, threadCountValues, mapping);
+/// Helper to sort `values` according to matching `keys`.
+SmallVector<Value> ForeachThreadOp::getValuesSortedByKey(
+ ArrayRef<Attribute> keys, ValueRange values,
+ llvm::function_ref<bool(Attribute, Attribute)> compare) {
+ if (keys.empty())
+ return values;
+ assert(keys.size() == values.size() && "unexpected mismatching sizes");
+ auto indices = llvm::to_vector(llvm::seq<int64_t>(0, values.size()));
+ std::sort(indices.begin(), indices.end(),
+ [&](int64_t i, int64_t j) { return compare(keys[i], keys[j]); });
+ SmallVector<Value> res;
+ res.reserve(values.size());
+ for (int64_t i = 0, e = indices.size(); i < e; ++i)
+ res.push_back(values[indices[i]]);
+ return res;
}
ForeachThreadOp mlir::scf::getForeachThreadOpThreadIndexOwner(Value val) {
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
index de50c59bae6be..c45d9c058dd06 100644
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -67,7 +67,7 @@ func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>,
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
gpu.terminator
}
@@ -79,7 +79,7 @@ func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>,
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
gpu.terminator
}
@@ -106,7 +106,7 @@ func.func @map_nested_foreach_to_threads_dynamic_trip_count(%x: memref<2 x 32 x
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
gpu.terminator
}
return %y : memref<2 x 32 x f32>
@@ -131,7 +131,7 @@ func.func @map_nested_foreach_to_threads_4d_loop(%x: memref<2x32x32x32xf32>, %y:
scf.foreach_thread (%i, %j, %k, %l) in (%c2, %c32,%c32,%c32) {
%4 = memref.load %x[%i, %j, %k, %l] : memref<2x32x32x32xf32>
memref.store %4, %y[%i, %j, %k, %l] : memref<2x32x32x32xf32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>, #gpu.thread<z>] }
gpu.terminator
}
return %y : memref<2x32x32x32xf32>
@@ -197,14 +197,14 @@ func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
scf.foreach_thread (%i, %j) in (%c7, %c9) {
%4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
gpu.terminator
}
@@ -232,14 +232,14 @@ func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<x>, #gpu.thread<y>] }
scf.foreach_thread (%i, %j) in (%c7, %c9) {
%4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
return %y : memref<2 x 32 x f32>
}
@@ -261,7 +261,7 @@ func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
- } { mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>] }
+ } { mapping = [#gpu.block<x>, #gpu.block<y>] }
return %y : memref<2 x 32 x f32>
}
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index eb7208bfc41a2..a5e13032d9956 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -24,7 +24,7 @@ func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream
%5 = memref.load %y[%i, %j] : !type
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : !type
- } { mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]}
+ } { mapping = [#gpu.block<x>, #gpu.block<y>]}
gpu.terminator
}
return %y : !type
@@ -73,12 +73,12 @@ func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g
%5 = memref.load %y[%i, %j] : !type
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : !type
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>]}
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.foreach_thread (%i) in (%c12) {
%7 = memref.load %t[%i] : !type1d
%8 = arith.addf %alpha, %7 : f32
memref.store %8, %t[%i] : !type1d
- } {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>] }
+ } {mapping = [#gpu.thread<x>] }
gpu.terminator
}
return %y : !type
@@ -118,8 +118,8 @@ func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
%5 = memref.load %y[%i, %j, %k, %l] : !type4d
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j, %k, %l] : !type4d
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
- } { mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
+ } { mapping = [#gpu.block<x>, #gpu.block<y>] }
return %y : !type4d
}
@@ -151,7 +151,7 @@ func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %
%5 = memref.load %y[%i, %j] : !type
%6 = math.fma %alpha, %4, %5 : f32
memref.store %6, %y[%i, %j] : !type
- } { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>] }
+ } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
gpu.terminator
}
return %y : !type
diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index fa91ba03a0aac..5fbb0f735d272 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -575,6 +575,21 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
// -----
+func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c65535 = arith.constant 65535 : index
+ // expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}}
+ scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } { mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>] }
+ return %y : memref<2 x 32 x f32>
+}
+
+// -----
+
func.func @switch_wrong_case_count(%arg0: index) {
// expected-error @below {{'scf.index_switch' op has 0 case regions but 1 case values}}
"scf.index_switch"(%arg0) ({