[Mlir-commits] [mlir] 90ecfa2 - [mlir][linalg] NFC - Move some utils in preparation for revamping mapping of scf.forall
Nicolas Vasilache
llvmlistbot at llvm.org
Mon Jul 24 16:20:06 PDT 2023
Author: Nicolas Vasilache
Date: 2023-07-25T01:19:57+02:00
New Revision: 90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7
URL: https://github.com/llvm/llvm-project/commit/90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7
DIFF: https://github.com/llvm/llvm-project/commit/90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7.diff
LOG: [mlir][linalg] NFC - Move some utils in preparation for revamping mapping of scf.forall
Added:
mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
Modified:
mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
mlir/lib/Dialect/Utils/StaticValueUtils.cpp
utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
index 3b10fcb77aaf39..a1cfa406c60ceb 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -31,51 +31,7 @@ namespace mlir {
class DialectRegistry;
namespace transform {
namespace gpu {
-
-/// Helper type for functions that generate ids for the mapping of a
-/// scf.forall.
-struct IdBuilderResult {
- // Ops used to replace the forall induction variables.
- SmallVector<Value> mappingIdOps;
- // Actual mapping sizes used to predicate the forall body when they are
- // smaller than the available mapping sizes.
- SmallVector<int64_t> predicateMappingSizes;
- // Ops used to predicate the forall body when predicateMappingSizes is smaller
- // than the available mapping sizes.
- SmallVector<Value> predicateIdOps;
-};
-
-/// Common gpu id builder type, allows the configuration of lowering for various
-/// mapping schemes. Takes:
-/// - A rewriter with insertion point set before the forall op to rewrite.
-/// - The loc of the forall op to rewrite.
-/// - A list of positive integers carrying the mapping sizes for the current
-/// forall op to rewrite.
-using GpuIdBuilderFnType =
- std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
-
-/// Helper struct for configuring the rewrite of mapped scf.forall ops to
-/// various gpu id configurations.
-struct GpuIdBuilder {
- GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
- : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
- mappingAttributes(), idBuilder() {}
-
- /// List of OpFoldResult carrying the multi-dimensional number of
- /// threads available in the current kernel (i.e. the current blockDims in
- /// CUDA parlance).
- ArrayRef<OpFoldResult> blockDimsOfr;
-
- /// A list of positive integers carrying the number of available mapping
- /// resources that can trigger predication,
- ArrayRef<int64_t> availableMappingSizes;
-
- /// The mapping attributes targeted by this generator.
- SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-
- /// The constructor that builds the concrete IR for mapping ids.
- GpuIdBuilderFnType idBuilder;
-};
+struct GpuIdBuilder;
/// Map the top level `scf.forall` op to GPU Thread Blocks.
/// Mapping is one-to-one and the induction variables of `scf.forall` are
@@ -121,11 +77,6 @@ DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
Operation *target, ArrayRef<int64_t> blockDimsOfr,
ArrayRef<int64_t> warpDims, bool syncAfterDistribute);
-/// Find the unique top level scf::ForallOp within a given target op.
-DiagnosedSilenceableFailure
-findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
- TransformOpInterface transformOp);
-
} // namespace gpu
} // namespace transform
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
new file mode 100644
index 00000000000000..ac10f5c5008eff
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
@@ -0,0 +1,157 @@
+//===- Utils.h - Utils for GPU transform ops --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
+#define MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+namespace gpu {
+class GPUOp;
+class LaunchOp;
+enum class MappingId : uint64_t;
+} // namespace gpu
+namespace scf {
+class ForallOp;
+} // namespace scf
+namespace transform {
+namespace gpu {
+
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+struct IdBuilderResult {
+ // Ops used to replace the forall induction variables.
+ SmallVector<Value> mappingIdOps;
+ // Actual mapping sizes used to predicate the forall body when they are
+ // smaller than the available mapping sizes.
+ SmallVector<int64_t> predicateMappingSizes;
+ // Ops used to predicate the forall body when predicateMappingSizes is smaller
+ // than the available mapping sizes.
+ SmallVector<Value> predicateIdOps;
+};
+
+/// Common gpu id builder type, allows the configuration of lowering for various
+/// mapping schemes. Takes:
+/// - A rewriter with insertion point set before the forall op to rewrite.
+/// - The loc of the forall op to rewrite.
+/// - A list of positive integers carrying the mapping sizes for the current
+/// forall op to rewrite.
+using GpuIdBuilderFnType =
+ std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
+
+/// Helper struct for configuring the rewrite of mapped scf.forall ops to
+/// various gpu id configurations.
+struct GpuIdBuilder {
+ GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
+ : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
+ mappingAttributes(), idBuilder() {}
+
+ /// List of OpFoldResult carrying the multi-dimensional number of
+ /// threads available in the current kernel (i.e. the current blockDims in
+ /// CUDA parlance).
+ ArrayRef<OpFoldResult> blockDimsOfr;
+
+ /// A list of positive integers carrying the number of available mapping
+/// resources that can trigger predication.
+ ArrayRef<int64_t> availableMappingSizes;
+
+ /// The mapping attributes targeted by this generator.
+ SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+
+ /// The constructor that builds the concrete IR for mapping ids.
+ GpuIdBuilderFnType idBuilder;
+};
+
+/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuBlockIdBuilder : public GpuIdBuilder {
+ GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes);
+};
+
+/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without
+/// any reindexing.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuThreadIdBuilder : public GpuIdBuilder {
+ GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes);
+};
+
+/// Builder for warp ids used in mapping scf.forall to warps.
+/// This builder requires a specification of the number of warps along each
+/// dimension to more finely control mapping to warps, as well as predication,
+/// by solely analyzing the IR.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuWarpIdBuilder : public GpuIdBuilder {
+ GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes);
+ /// Static specification of the warp size.
+ /// In the future this may be configured by the transformation.
+ static constexpr int64_t kWarpSize = 32;
+};
+
+/// Builder for linear ids used in mapping scf.forall to reindexed threads.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 1-D sizes for predicate generation.
+struct GpuLinearIdBuilder : public GpuIdBuilder {
+ GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes);
+};
+
+/// Determine if the size of the kernel configuration is supported by the
+/// GPU architecture being used.
+/// TODO this is currently hardwired to CUDA, parameterize and generalize.
+DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
+ std::optional<int64_t> gridDimX,
+ std::optional<int64_t> gridDimY,
+ std::optional<int64_t> gridDimZ,
+ std::optional<int64_t> blockDimX,
+ std::optional<int64_t> blockDimY,
+ std::optional<int64_t> blockDimZ);
+
+/// Create an empty-body gpu::LaunchOp using the provided kernel settings
+/// and put a terminator within.
+DiagnosedSilenceableFailure
+createGpuLaunch(RewriterBase &rewriter, Location loc,
+ TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp,
+ std::optional<int64_t> gridDimX = std::nullopt,
+ std::optional<int64_t> gridDimY = std::nullopt,
+ std::optional<int64_t> gridDimZ = std::nullopt,
+ std::optional<int64_t> blockDimX = std::nullopt,
+ std::optional<int64_t> blockDimY = std::nullopt,
+ std::optional<int64_t> blockDimZ = std::nullopt);
+
+/// Alter kernel configuration of the given kernel.
+DiagnosedSilenceableFailure
+alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch,
+ TransformOpInterface transformOp,
+ std::optional<int64_t> gridDimX = std::nullopt,
+ std::optional<int64_t> gridDimY = std::nullopt,
+ std::optional<int64_t> gridDimZ = std::nullopt,
+ std::optional<int64_t> blockDimX = std::nullopt,
+ std::optional<int64_t> blockDimY = std::nullopt,
+ std::optional<int64_t> blockDimZ = std::nullopt);
+
+/// Find the unique top level scf::ForallOp within a given target op.
+DiagnosedSilenceableFailure
+findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
+ TransformOpInterface transformOp);
+
+} // namespace gpu
+} // namespace transform
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
index 8c9b5e567f6699..b8f7a26ab6a178 100644
--- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
@@ -84,6 +84,9 @@ SmallVector<OpFoldResult> getAsIndexOpFoldResult(MLIRContext *ctx,
/// If ofr is a constant integer or an IntegerAttr, return the integer.
std::optional<int64_t> getConstantIntValue(OpFoldResult ofr);
+/// If all ofrs are constant integers or IntegerAttrs, return the integers.
+std::optional<SmallVector<int64_t>>
+getConstantIntValues(ArrayRef<OpFoldResult> ofrs);
/// Return true if `ofr` is constant integer equal to `value`.
bool isConstantIntValue(OpFoldResult ofr, int64_t value);
diff --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
index a2c07fa213ffd9..73b77343ef5073 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIRGPUTransformOps
GPUTransformOps.cpp
+ Utils.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 9182ccd96e84a8..07470c24ae2d69 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
+#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
@@ -622,204 +623,6 @@ void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
// Block and thread mapping utilities.
//===----------------------------------------------------------------------===//
-namespace {
-
-/// Return a flattened thread id for the workgroup with given sizes.
-static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
- ArrayRef<OpFoldResult> blockDimsOfr) {
- LLVM_DEBUG(llvm::interleaveComma(
- blockDimsOfr,
- DBGS() << "----buildLinearThreadId with blockDimsOfr: ");
- llvm::dbgs() << "\n");
- assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");
- AffineExpr tx, ty, tz, BDX, BDY;
- bindDims(rewriter.getContext(), tx, ty, tz);
- bindSymbols(rewriter.getContext(), BDX, BDY);
- IndexType indexType = rewriter.getIndexType();
- SmallVector<OpFoldResult> threadsAndWorkGroups{
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
- threadsAndWorkGroups.push_back(blockDimsOfr[0]);
- threadsAndWorkGroups.push_back(blockDimsOfr[1]);
- OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
- rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
- return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
-}
-
-/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuBlockIdBuilder : public GpuIdBuilder {
-
- GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
- ArrayRef<int64_t> mappingSizes)
- : GpuIdBuilder(blockDims, mappingSizes) {
- mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
- GPUBlockMappingAttr::get(ctx, Blocks::DimY),
- GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
- idBuilder = [](RewriterBase &rewriter, Location loc,
- ArrayRef<int64_t> forallMappingSizes) {
- IndexType indexType = rewriter.getIndexType();
- SmallVector<Value> ids{
- rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
- rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
- rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
- // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
- // predicate generation.
- return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
- ids};
- };
- }
-};
-
-/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without
-/// any reindexing.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuThreadIdBuilder : public GpuIdBuilder {
- GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
- ArrayRef<int64_t> mappingSizes)
- : GpuIdBuilder(blockDims, mappingSizes) {
- mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
- GPUThreadMappingAttr::get(ctx, Threads::DimY),
- GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
- idBuilder = [](RewriterBase &rewriter, Location loc,
- ArrayRef<int64_t> forallMappingSizes) {
- IndexType indexType = rewriter.getIndexType();
- SmallVector<Value> ids{
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
- rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
- // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
- // predicate generation.
- return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
- ids};
- };
- }
-};
-
-/// Builder for warp ids used in mapping scf.forall to warps.
-/// This builder requires a specification of the number of warps along each
-/// dimension to more finely control mapping to warps as well a predication than
-/// by solely analyzing the IR.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuWarpIdBuilder : public GpuIdBuilder {
- GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
- ArrayRef<int64_t> mappingSizes)
- : GpuIdBuilder(blockDims, mappingSizes) {
- mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
- GPUWarpMappingAttr::get(ctx, Warps::DimY),
- GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
- idBuilder = [this](RewriterBase &rewriter, Location loc,
- ArrayRef<int64_t> forallMappingSizes) {
- // Build the linear warp id and decompose it in the basis of
- // `forallMappingSizes`.
- Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
- AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
- OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply(
- rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
- Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
- // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
- // "row-major" order.
- SmallVector<int64_t> reverseBasisSizes(
- llvm::reverse(this->availableMappingSizes));
- SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
- SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
- SmallVector<Value> ids;
- // Reverse back to be in [x, y, z] order.
- for (AffineExpr e : llvm::reverse(delinearizingExprs))
- ids.push_back(
- affine::makeComposedAffineApply(rewriter, loc, e, {warpId}));
-
- // clang-format off
- LDBG("----linearId: " << linearId);
- LDBG("----warpId: " << warpId);
- LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
- DBGS() << "--delinearization basis: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(strides,
- DBGS() << "--delinearization strides: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(delinearizingExprs,
- DBGS() << "--delinearization exprs: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(ids, DBGS() << "--ids: ");
- llvm::dbgs() << "\n";);
- // clang-format on
-
- // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
- // predicate generation.
- return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
- ids};
- };
- }
-
- /// Static specification of the warp size.
- /// In the future this may be configured by the transformation.
- static constexpr int64_t kWarpSize = 32;
-};
-
-/// Builder for linear ids used in mapping scf.forall to reindexed threads.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 1-D sizes for predicate generation.
-struct GpuLinearIdBuilder : public GpuIdBuilder {
- GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
- ArrayRef<int64_t> mappingSizes)
- : GpuIdBuilder(blockDims, mappingSizes) {
- mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
- GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
- GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
- idBuilder = [this](RewriterBase &rewriter, Location loc,
- ArrayRef<int64_t> forallMappingSizes) {
- // Build the linear thread id and decompose it in the basis of
- // `forallMappingSizes`.
- Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
- // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
- // "row-major" order.
- SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
- SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
- AffineExpr d0;
- bindDims(rewriter.getContext(), d0);
- SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
- SmallVector<Value> ids;
- // Reverse back to be in [x, y, z] order.
- for (AffineExpr e : llvm::reverse(delinearizingExprs))
- ids.push_back(
- affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
-
- // clang-format off
- LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
- DBGS() << "--delinearization basis: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(strides,
- DBGS() << "--delinearization strides: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(delinearizingExprs,
- DBGS() << "--delinearization exprs: ");
- llvm::dbgs() << "\n";
- llvm::interleaveComma(ids, DBGS() << "--ids: ");
- llvm::dbgs() << "\n";);
- // clang-format on
-
- // Compute and return the 1-D actual mapping size spanned by the linearId,
- // it will be used to predicate against the linearized total number of
- // threads.
- int64_t actualMappingSize = 1;
- for (int64_t s : forallMappingSizes)
- actualMappingSize *= s;
-
- // Return 3-D ids for indexing rewrite and 1-D size and id for
- // predicate generation.
- return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
- SmallVector<Value>{linearId}};
- };
- }
-};
-
-} // namespace
-
static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
Operation *target, const Twine &message) {
@@ -905,121 +708,6 @@ verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
return DiagnosedSilenceableFailure::success();
}
-/// Determines if the size of the kernel configuration is supported by the
-/// GPU architecture being used. It presently makes use of CUDA limitations,
-/// however that aspect may be enhanced for other GPUs.
-static DiagnosedSilenceableFailure checkGpuLimits(
- TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
- std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
- std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
- std::optional<int64_t> blockDimZ) {
-
- static constexpr int maxTotalBlockdim = 1024;
- static constexpr int maxBlockdimx = 1024;
- static constexpr int maxBlockdimy = 1024;
- static constexpr int maxBlockdimz = 64;
- static constexpr int maxTotalGriddim = 2147483647;
- static constexpr int maxGriddimx = 2147483647;
- static constexpr int maxGriddimy = 65535;
- static constexpr int maxGriddimz = 65535;
-
- if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
- maxTotalBlockdim ||
- (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
- maxTotalGriddim ||
- blockDimX.value_or(1) > maxBlockdimx ||
- blockDimY.value_or(1) > maxBlockdimy ||
- blockDimZ.value_or(1) > maxBlockdimz ||
- gridDimY.value_or(1) > maxGriddimy ||
- gridDimZ.value_or(1) > maxGriddimz ||
- gridDimX.value_or(1) > maxGriddimx) {
- return transformOp.emitSilenceableError()
- << "Trying to launch a GPU kernel with grid_dims = ("
- << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
- << gridDimZ.value_or(1) << ") block_dims = ("
- << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
- << blockDimZ.value_or(1) << "). It is larger than the limits.";
- }
- return DiagnosedSilenceableFailure::success();
-}
-
-/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
-/// and put a terminator within.
-static DiagnosedSilenceableFailure
-createGpuLaunch(RewriterBase &rewriter, Location loc,
- TransformOpInterface transformOp, LaunchOp &launchOp,
- std::optional<int64_t> gridDimX = std::nullopt,
- std::optional<int64_t> gridDimY = std::nullopt,
- std::optional<int64_t> gridDimZ = std::nullopt,
- std::optional<int64_t> blockDimX = std::nullopt,
- std::optional<int64_t> blockDimY = std::nullopt,
- std::optional<int64_t> blockDimZ = std::nullopt) {
- DiagnosedSilenceableFailure diag =
- checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
- blockDimY, blockDimZ);
- if (!diag.succeeded())
- return diag;
-
- auto createConst = [&](int dim) {
- return rewriter.create<arith::ConstantIndexOp>(loc, dim);
- };
- OpBuilder::InsertionGuard guard(rewriter);
- Value one = createConst(1);
- Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
- Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
- Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
- Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
- Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
- Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
- launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
- blkSizeX, blkSizeY, blkSizeZ);
- rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
- rewriter.create<TerminatorOp>(loc);
- return DiagnosedSilenceableFailure::success();
-}
-
-/// Alter kernel configuration of the given kernel.
-static DiagnosedSilenceableFailure
-alterGpuLaunch(RewriterBase &rewriter, LaunchOp gpuLaunch,
- TransformOpInterface transformOp,
- std::optional<int64_t> gridDimX = std::nullopt,
- std::optional<int64_t> gridDimY = std::nullopt,
- std::optional<int64_t> gridDimZ = std::nullopt,
- std::optional<int64_t> blockDimX = std::nullopt,
- std::optional<int64_t> blockDimY = std::nullopt,
- std::optional<int64_t> blockDimZ = std::nullopt) {
- DiagnosedSilenceableFailure diag =
- checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
- blockDimY, blockDimZ);
- if (!diag.succeeded())
- return diag;
-
- KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
- OpBuilder::InsertionGuard guard(rewriter);
- rewriter.setInsertionPointAfterValue(currentBlockdim.x);
- auto createConstValue = [&](int dim) {
- return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
- dim);
- };
-
- if (gridDimX.has_value())
- gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
- if (gridDimY.has_value())
- gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
- if (gridDimZ.has_value())
- gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
- if (blockDimX.has_value())
- gpuLaunch.getBlockSizeXMutable().assign(
- createConstValue(blockDimX.value()));
- if (blockDimY.has_value())
- gpuLaunch.getBlockSizeYMutable().assign(
- createConstValue(blockDimY.value()));
- if (blockDimZ.has_value())
- gpuLaunch.getBlockSizeZMutable().assign(
- createConstValue(blockDimZ.value()));
- return DiagnosedSilenceableFailure::success();
-}
-
/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
SmallVector<int64_t> mappingSizes;
@@ -1223,26 +911,6 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
return DiagnosedSilenceableFailure::success();
}
-DiagnosedSilenceableFailure
-mlir::transform::gpu::findTopLevelForallOp(Operation *target,
- scf::ForallOp &topLevelForallOp,
- TransformOpInterface transformOp) {
- auto walkResult = target->walk([&](scf::ForallOp forallOp) {
- if (forallOp->getParentOfType<scf::ForallOp>())
- return WalkResult::advance();
- if (topLevelForallOp)
- // TODO: Handle multiple forall if they are independent.
- return WalkResult::interrupt();
- topLevelForallOp = forallOp;
- return WalkResult::advance();
- });
-
- if (walkResult.wasInterrupted())
- return transformOp.emitSilenceableError()
- << "could not find a unique topLevel scf.forall";
- return DiagnosedSilenceableFailure::success();
-}
-
DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
transform::TransformRewriter &rewriter, Operation *target,
ApplyToEachResultList &results, transform::TransformState &state) {
diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
new file mode 100644
index 00000000000000..3ba9bf1a5a14a4
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -0,0 +1,346 @@
+//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/TransformOps/Utils.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+using namespace mlir::transform;
+using namespace mlir::transform::gpu;
+
+#define DEBUG_TYPE "gpu-transforms"
+
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
+
+/// Return a flattened thread id for the workgroup with given sizes.
+static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
+ ArrayRef<OpFoldResult> blockDimsOfr) {
+ LLVM_DEBUG(llvm::interleaveComma(
+ blockDimsOfr,
+ DBGS() << "----buildLinearThreadId with blockDimsOfr: ");
+ llvm::dbgs() << "\n");
+ assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");
+ AffineExpr tx, ty, tz, BDX, BDY;
+ bindDims(rewriter.getContext(), tx, ty, tz);
+ bindSymbols(rewriter.getContext(), BDX, BDY);
+ IndexType indexType = rewriter.getIndexType();
+ SmallVector<OpFoldResult> threadsAndWorkGroups{
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
+ threadsAndWorkGroups.push_back(blockDimsOfr[0]);
+ threadsAndWorkGroups.push_back(blockDimsOfr[1]);
+ OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
+ return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
+}
+
+namespace mlir {
+namespace transform {
+namespace gpu {
+
+GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx,
+ ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes)
+ : GpuIdBuilder(blockDims, mappingSizes) {
+ mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
+ GPUBlockMappingAttr::get(ctx, Blocks::DimY),
+ GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
+ idBuilder = [](RewriterBase &rewriter, Location loc,
+ ArrayRef<int64_t> forallMappingSizes) {
+ IndexType indexType = rewriter.getIndexType();
+ SmallVector<Value> ids{
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+ rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
+ // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+ // predicate generation.
+ return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+ };
+}
+
+GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx,
+ ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes)
+ : GpuIdBuilder(blockDims, mappingSizes) {
+ mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
+ GPUThreadMappingAttr::get(ctx, Threads::DimY),
+ GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
+ idBuilder = [](RewriterBase &rewriter, Location loc,
+ ArrayRef<int64_t> forallMappingSizes) {
+ IndexType indexType = rewriter.getIndexType();
+ SmallVector<Value> ids{
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+ rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+ // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+ // predicate generation.
+ return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+ };
+}
+
+GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx,
+ ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes)
+ : GpuIdBuilder(blockDims, mappingSizes) {
+ mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
+ GPUWarpMappingAttr::get(ctx, Warps::DimY),
+ GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
+ idBuilder = [this](RewriterBase &rewriter, Location loc,
+ ArrayRef<int64_t> forallMappingSizes) {
+ // Build the linear warp id and decompose it in the basis of
+ // `forallMappingSizes`.
+ Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+ AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+ OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
+ Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
+    // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+    // "row-major" order.
+ SmallVector<int64_t> reverseBasisSizes(
+ llvm::reverse(this->availableMappingSizes));
+ SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+ SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+ SmallVector<Value> ids;
+ // Reverse back to be in [x, y, z] order.
+ for (AffineExpr e : llvm::reverse(delinearizingExprs))
+ ids.push_back(
+ affine::makeComposedAffineApply(rewriter, loc, e, {warpId}));
+
+ // clang-format off
+ LDBG("----linearId: " << linearId);
+ LDBG("----warpId: " << warpId);
+ LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+ DBGS() << "--delinearization basis: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(strides,
+ DBGS() << "--delinearization strides: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(delinearizingExprs,
+ DBGS() << "--delinearization exprs: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(ids, DBGS() << "--ids: ");
+ llvm::dbgs() << "\n";);
+ // clang-format on
+
+ // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+ // predicate generation.
+ return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+ };
+}
+
+GpuLinearIdBuilder::GpuLinearIdBuilder(MLIRContext *ctx,
+ ArrayRef<OpFoldResult> blockDims,
+ ArrayRef<int64_t> mappingSizes)
+ : GpuIdBuilder(blockDims, mappingSizes) {
+ mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
+ GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
+ GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
+ idBuilder = [this](RewriterBase &rewriter, Location loc,
+ ArrayRef<int64_t> forallMappingSizes) {
+ // Build the linear thread id and decompose it in the basis of
+ // `forallMappingSizes`.
+ Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+    // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+    // "row-major" order.
+ SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+ SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+ AffineExpr d0;
+ bindDims(rewriter.getContext(), d0);
+ SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+ SmallVector<Value> ids;
+ // Reverse back to be in [x, y, z] order.
+ for (AffineExpr e : llvm::reverse(delinearizingExprs))
+ ids.push_back(
+ affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
+
+ // clang-format off
+ LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+ DBGS() << "--delinearization basis: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(strides,
+ DBGS() << "--delinearization strides: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(delinearizingExprs,
+ DBGS() << "--delinearization exprs: ");
+ llvm::dbgs() << "\n";
+ llvm::interleaveComma(ids, DBGS() << "--ids: ");
+ llvm::dbgs() << "\n";);
+ // clang-format on
+
+ // Compute and return the 1-D actual mapping size spanned by the linearId,
+ // it will be used to predicate against the linearized total number of
+ // threads.
+ int64_t actualMappingSize = 1;
+ for (int64_t s : forallMappingSizes)
+ actualMappingSize *= s;
+
+ // Return 3-D ids for indexing rewrite and 1-D size and id for
+ // predicate generation.
+ return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
+ SmallVector<Value>{linearId}};
+ };
+}
+
+DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
+ std::optional<int64_t> gridDimX,
+ std::optional<int64_t> gridDimY,
+ std::optional<int64_t> gridDimZ,
+ std::optional<int64_t> blockDimX,
+ std::optional<int64_t> blockDimY,
+ std::optional<int64_t> blockDimZ) {
+
+ // TODO: pass a configuration object to set the limits properly.
+ static constexpr int maxTotalBlockdim = 1024;
+ static constexpr int maxBlockdimx = 1024;
+ static constexpr int maxBlockdimy = 1024;
+ static constexpr int maxBlockdimz = 64;
+ static constexpr int maxTotalGriddim = 2147483647;
+ static constexpr int maxGriddimx = 2147483647;
+ static constexpr int maxGriddimy = 65535;
+ static constexpr int maxGriddimz = 65535;
+
+ if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
+ maxTotalBlockdim ||
+ (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
+ maxTotalGriddim ||
+ blockDimX.value_or(1) > maxBlockdimx ||
+ blockDimY.value_or(1) > maxBlockdimy ||
+ blockDimZ.value_or(1) > maxBlockdimz ||
+ gridDimY.value_or(1) > maxGriddimy ||
+ gridDimZ.value_or(1) > maxGriddimz ||
+ gridDimX.value_or(1) > maxGriddimx) {
+ return transformOp.emitSilenceableError()
+ << "Trying to launch a GPU kernel with grid_dims = ("
+ << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
+ << gridDimZ.value_or(1) << ") block_dims = ("
+ << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
+ << blockDimZ.value_or(1) << "). It is larger than the limits.";
+ }
+ return DiagnosedSilenceableFailure::success();
+}
+
+DiagnosedSilenceableFailure createGpuLaunch(
+ RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
+ LaunchOp &launchOp, std::optional<int64_t> gridDimX,
+ std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
+ std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
+ std::optional<int64_t> blockDimZ) {
+ DiagnosedSilenceableFailure diag =
+ checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
+ blockDimY, blockDimZ);
+ if (!diag.succeeded())
+ return diag;
+
+ auto createConst = [&](int dim) {
+ return rewriter.create<arith::ConstantIndexOp>(loc, dim);
+ };
+ OpBuilder::InsertionGuard guard(rewriter);
+ Value one = createConst(1);
+ Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
+ Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
+ Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
+ Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
+ Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
+ Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
+ launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
+ blkSizeX, blkSizeY, blkSizeZ);
+ rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
+ rewriter.create<TerminatorOp>(loc);
+ return DiagnosedSilenceableFailure::success();
+}
+
+/// Alter kernel configuration of the given kernel.
+DiagnosedSilenceableFailure alterGpuLaunch(
+ RewriterBase &rewriter, LaunchOp gpuLaunch,
+ TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
+ std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
+ std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
+ std::optional<int64_t> blockDimZ) {
+ DiagnosedSilenceableFailure diag =
+ checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
+ blockDimY, blockDimZ);
+ if (!diag.succeeded())
+ return diag;
+
+ KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
+ OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPointAfterValue(currentBlockdim.x);
+ auto createConstValue = [&](int dim) {
+ return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
+ dim);
+ };
+
+ if (gridDimX.has_value())
+ gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
+ if (gridDimY.has_value())
+ gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
+ if (gridDimZ.has_value())
+ gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
+ if (blockDimX.has_value())
+ gpuLaunch.getBlockSizeXMutable().assign(
+ createConstValue(blockDimX.value()));
+ if (blockDimY.has_value())
+ gpuLaunch.getBlockSizeYMutable().assign(
+ createConstValue(blockDimY.value()));
+ if (blockDimZ.has_value())
+ gpuLaunch.getBlockSizeZMutable().assign(
+ createConstValue(blockDimZ.value()));
+ return DiagnosedSilenceableFailure::success();
+}
+
+DiagnosedSilenceableFailure
+findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
+ TransformOpInterface transformOp) {
+ auto walkResult = target->walk([&](scf::ForallOp forallOp) {
+ if (forallOp->getParentOfType<scf::ForallOp>())
+ return WalkResult::advance();
+ if (topLevelForallOp)
+ // TODO: Handle multiple forall if they are independent.
+ return WalkResult::interrupt();
+ topLevelForallOp = forallOp;
+ return WalkResult::advance();
+ });
+
+ if (walkResult.wasInterrupted())
+ return transformOp.emitSilenceableError()
+ << "could not find a unique topLevel scf.forall";
+ return DiagnosedSilenceableFailure::success();
+}
+
+} // namespace gpu
+} // namespace transform
+} // namespace mlir
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 7db793b766a1b1..2e0bafb4fc6545 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -121,6 +121,20 @@ std::optional<int64_t> getConstantIntValue(OpFoldResult ofr) {
return std::nullopt;
}
+std::optional<SmallVector<int64_t>>
+getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+ bool failed = false;
+ SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+ auto cv = getConstantIntValue(ofr);
+ if (!cv.has_value())
+ failed = true;
+ return cv.has_value() ? cv.value() : 0;
+ });
+ if (failed)
+ return std::nullopt;
+ return res;
+}
+
/// Return true if `ofr` is constant integer equal to `value`.
bool isConstantIntValue(OpFoldResult ofr, int64_t value) {
auto val = getConstantIntValue(ofr);
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index d1601a3e5937f3..2d110883f57d39 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4747,12 +4747,12 @@ gentbl_cc_library(
cc_library(
name = "GPUTransformOps",
- srcs = [
- "lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp",
- ],
- hdrs = [
- "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h",
- ],
+ srcs = glob([
+ "lib/Dialect/GPU/TransformOps/*.cpp",
+ ]),
+ hdrs = glob([
+ "include/mlir/Dialect/GPU/TransformOps/*.h",
+ ]),
includes = ["include"],
deps = [
":AffineDialect",
More information about the Mlir-commits
mailing list